From 0c6d340165f1d82820c8a341e43bc4f639091192 Mon Sep 17 00:00:00 2001 From: Luke Slater Date: Tue, 19 May 2020 23:12:57 +0100 Subject: [PATCH] reform file-list, some vocab changes, ability to save pdf conversions to txt. --- README.md | 2 +- src/main/groovy/komenti/App.groovy | 1 + src/main/groovy/komenti/Komenti.groovy | 35 ++++++++++++------- .../groovy/komenti/klib/Komentisto.groovy | 2 +- src/main/groovy/komenti/klib/PDFReader.groovy | 7 ++-- src/main/resources/words/uncertain.txt | 7 +--- 6 files changed, 31 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 77ef6ca..2f704be 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ It enables querying multiple ontologies with complex class descriptions using Ab ## Installation -You can find the latest release here: https://github.com/reality/komenti/releases/tag/0.0.4-SNAPSHOT-2 +You can find the latest release here: https://github.com/reality/komenti/releases/tag/0.0.4-SNAPSHOT-3 You can add the bin/ directory to your PATH, to be able to use it easily from anywhere. It should also work on Windows, but I haven't tested that. diff --git a/src/main/groovy/komenti/App.groovy b/src/main/groovy/komenti/App.groovy index 579e13d..317f31e 100644 --- a/src/main/groovy/komenti/App.groovy +++ b/src/main/groovy/komenti/App.groovy @@ -31,6 +31,7 @@ class App { _ longOpt: 'disable-modifiers', 'Don\'t evaluate negation and uncertainty. The reason for this is: it takes a lot of time!', type: Boolean _ longOpt: 'family-modifier', 'Evaluate sentences for whether or not they mention a family member.', type: Boolean _ longOpt: 'exclude', 'A list of phrases, which when matched in a sentence, will cause that sentence not to be annotated. One phrase per line.', args: 1 + _ longOpt: 'write-pdfs-to-dir', 'If set, write the converted PDF text into the given directory.', args: 1 // summary options a longOpt: 'annotation-file', 'Annotation file to summarise', args: 1 diff --git a/src/main/groovy/komenti/Komenti.groovy b/src/main/groovy/komenti/Komenti.groovy index 6a13d47..212de48 100644 --- a/src/main/groovy/komenti/Komenti.groovy +++ b/src/main/groovy/komenti/Komenti.groovy @@ -184,7 +184,7 @@ public class Komenti { println "Done" } else if(command == 'annotate') { - if(!o.t || !o.l) { cliBuilder.usage() ; System.exit(1) } + if((!o.t && !o['file-list']) || !o.l) { cliBuilder.usage() ; System.exit(1) } if(!o.out) { println "Must provide output filename via --out" ; System.exit(1) } def vocab = Vocabulary.loadFile(o.l) @@ -196,20 +196,22 @@ public class Komenti { def outWriter = new BufferedWriter(new FileWriter(o.out)) - // TODO move this somewhere else - def target = new File(o.t) - def processFileOrDir - processFileOrDir = { f, item -> - if(item.isDirectory()) { - item.eachFile { processFileOrDir(f, it) } - } else { - if(!fList || (fList && fList.contains(item.getName()))) { - f << item + def files = fList + if(o.t) { + def target = new File(o.t) + def processFileOrDir + processFileOrDir = { f, item -> + if(item.isDirectory()) { + item.eachFile { processFileOrDir(f, it) } + } else { + if(!fList || (fList && fList.contains(item.getName()))) { + f << item + } } + f } - f + files = processFileOrDir([], target) } - def files = processFileOrDir([], target) println "Annotating ${files.size()} files ..." def komentisto = new Komentisto(vocab, @@ -222,7 +224,14 @@ public class Komenti { GParsPool.withPool(o['threads'] ?: 1) { p -> files.eachParallel{ f -> def (name, text) = [f.getName(), f.text] - if(name =~ /(?i)pdf/) { text = new PDFReader(f).getText() } + if(name =~ /(?i)pdf/) { + text = new PDFReader(f).getText() + if(o['write-pdfs-to-dir']) { + def dir = new File(o['write-pdfs-to-dir']) + if(!dir.exists()) { dir.mkdir() } + new File(dir, f.getName() + '.txt').text = text + } + } def annotations if(o['per-line']) { diff --git a/src/main/groovy/komenti/klib/Komentisto.groovy b/src/main/groovy/komenti/klib/Komentisto.groovy index 1c9a115..53e3f79 100644 --- a/src/main/groovy/komenti/klib/Komentisto.groovy +++ b/src/main/groovy/komenti/klib/Komentisto.groovy @@ -58,7 +58,7 @@ public class Komentisto { } def addRegexNERProps(props) { - props.put("regexner.mapping", vocabulary.labelFile.getAbsolutePath()) + props.put("regexner.mapping", new File(vocabulary.labelPath).getAbsolutePath()) props.put("regexner.mapping.header", "pattern,ner,q,ontology,priority") // wtf props.put("regexner.mapping.field.q", 'edu.stanford.nlp.ling.CoreAnnotations$NormalizedNamedEntityTagAnnotation') // wtf props.put("regexner.mapping.field.ontology", diff --git a/src/main/groovy/komenti/klib/PDFReader.groovy b/src/main/groovy/komenti/klib/PDFReader.groovy index 1d3941a..82e3ef7 100644 --- a/src/main/groovy/komenti/klib/PDFReader.groovy +++ b/src/main/groovy/komenti/klib/PDFReader.groovy @@ -20,11 +20,14 @@ public class PDFReader { text = text.replaceAll('\u2022', '. ') text = text.replaceAll('–', '. ') - text = text.replaceAll('-', '. ') - text = text.replaceAll('– ', '. ') + text = text.replaceAll('\b-', '. ') + text = text.replaceAll('\b–', '. ') + text = text.replaceAll('-\b', '. ') + text = text.replaceAll('–\b', '. ') text = text.replaceAll('\\s+', ' ') text = text.replaceAll(', \\?', '. ?') text = text.replaceAll('\\.', '. ') + text = text.replaceAll('\n\n', '. ') pages << text } diff --git a/src/main/resources/words/uncertain.txt b/src/main/resources/words/uncertain.txt index afcdb38..cb24974 100644 --- a/src/main/resources/words/uncertain.txt +++ b/src/main/resources/words/uncertain.txt @@ -68,7 +68,6 @@ rarely regularly should tends -usually abeyance abeyances almost @@ -286,8 +285,6 @@ speculative speculatively sporadic sporadically -sudden -suddenly susceptibility tending tentative @@ -369,13 +366,11 @@ investigation investigations monitor monitored -development -dissociate insufficient future analogous -develop suggested look for imply unless +may develop