Skip to content

Commit

Permalink
reform file-list, some vocab changes, ability to save pdf conversions…
Browse files Browse the repository at this point in the history
… to txt.
  • Loading branch information
reality committed May 19, 2020
1 parent eb20562 commit 0c6d340
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 23 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ It enables querying multiple ontologies with complex class descriptions using Ab

## Installation

You can find the latest release here: https://github.com/reality/komenti/releases/tag/0.0.4-SNAPSHOT-2
You can find the latest release here: https://github.com/reality/komenti/releases/tag/0.0.4-SNAPSHOT-3

You can add the bin/ directory to your PATH so that you can run it easily from anywhere. It should also work on Windows, but I haven't tested that.

Expand Down
1 change: 1 addition & 0 deletions src/main/groovy/komenti/App.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class App {
_ longOpt: 'disable-modifiers', 'Don\'t evaluate negation and uncertainty. The reason for this is: it takes a lot of time!', type: Boolean
_ longOpt: 'family-modifier', 'Evaluate sentences for whether or not they mention a family member.', type: Boolean
_ longOpt: 'exclude', 'A list of phrases, which when matched in a sentence, will cause that sentence not to be annotated. One phrase per line.', args: 1
_ longOpt: 'write-pdfs-to-dir', 'If set, write the converted PDF text into the given directory.', args: 1

// summary options
a longOpt: 'annotation-file', 'Annotation file to summarise', args: 1
Expand Down
35 changes: 22 additions & 13 deletions src/main/groovy/komenti/Komenti.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ public class Komenti {

println "Done"
} else if(command == 'annotate') {
if(!o.t || !o.l) { cliBuilder.usage() ; System.exit(1) }
if((!o.t && !o['file-list']) || !o.l) { cliBuilder.usage() ; System.exit(1) }
if(!o.out) { println "Must provide output filename via --out" ; System.exit(1) }

def vocab = Vocabulary.loadFile(o.l)
Expand All @@ -196,20 +196,22 @@ public class Komenti {

def outWriter = new BufferedWriter(new FileWriter(o.out))

// TODO move this somewhere else
def target = new File(o.t)
def processFileOrDir
processFileOrDir = { f, item ->
if(item.isDirectory()) {
item.eachFile { processFileOrDir(f, it) }
} else {
if(!fList || (fList && fList.contains(item.getName()))) {
f << item
def files = fList
if(o.t) {
def target = new File(o.t)
def processFileOrDir
processFileOrDir = { f, item ->
if(item.isDirectory()) {
item.eachFile { processFileOrDir(f, it) }
} else {
if(!fList || (fList && fList.contains(item.getName()))) {
f << item
}
}
f
}
f
files = processFileOrDir([], target)
}
def files = processFileOrDir([], target)

println "Annotating ${files.size()} files ..."
def komentisto = new Komentisto(vocab,
Expand All @@ -222,7 +224,14 @@ public class Komenti {
GParsPool.withPool(o['threads'] ?: 1) { p ->
files.eachParallel{ f ->
def (name, text) = [f.getName(), f.text]
if(name =~ /(?i)pdf/) { text = new PDFReader(f).getText() }
if(name =~ /(?i)pdf/) {
text = new PDFReader(f).getText()
if(o['write-pdfs-to-dir']) {
def dir = new File(o['write-pdfs-to-dir'])
if(!dir.exists()) { dir.mkdir() }
new File(dir, f.getName() + '.txt').text = text
}
}

def annotations
if(o['per-line']) {
Expand Down
2 changes: 1 addition & 1 deletion src/main/groovy/komenti/klib/Komentisto.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public class Komentisto {
}

def addRegexNERProps(props) {
props.put("regexner.mapping", vocabulary.labelFile.getAbsolutePath())
props.put("regexner.mapping", new File(vocabulary.labelPath).getAbsolutePath())
props.put("regexner.mapping.header", "pattern,ner,q,ontology,priority") // wtf
props.put("regexner.mapping.field.q", 'edu.stanford.nlp.ling.CoreAnnotations$NormalizedNamedEntityTagAnnotation') // wtf
props.put("regexner.mapping.field.ontology",
Expand Down
7 changes: 5 additions & 2 deletions src/main/groovy/komenti/klib/PDFReader.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@ public class PDFReader {

text = text.replaceAll('\u2022', '. ')
text = text.replaceAll('–', '. ')
text = text.replaceAll('-', '. ')
text = text.replaceAll('', '. ')
text = text.replaceAll('\b-', '. ')
text = text.replaceAll('\b', '. ')
text = text.replaceAll('-\b', '. ')
text = text.replaceAll('\b', '. ')
text = text.replaceAll('\\s+', ' ')
text = text.replaceAll(', \\?', '. ?')
text = text.replaceAll('\\.', '. ')
text = text.replaceAll('\n\n', '. ')

pages << text
}
Expand Down
7 changes: 1 addition & 6 deletions src/main/resources/words/uncertain.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ rarely
regularly
should
tends
usually
abeyance
abeyances
almost
Expand Down Expand Up @@ -286,8 +285,6 @@ speculative
speculatively
sporadic
sporadically
sudden
suddenly
susceptibility
tending
tentative
Expand Down Expand Up @@ -369,13 +366,11 @@ investigation
investigations
monitor
monitored
development
dissociate
insufficient
future
analogous
develop
suggested
look for
imply
unless
may develop

0 comments on commit 0c6d340

Please sign in to comment.