Commit
sentiment and metadata
reality committed Oct 3, 2021
1 parent 8536458 commit f37b5da
Showing 4 changed files with 70 additions and 31 deletions.
2 changes: 2 additions & 0 deletions src/main/groovy/komenti/App.groovy
@@ -25,6 +25,7 @@ class App {
_ longOpt: 'label-extension', 'Run a named label extension, e.g. cmo', args: 1
_ longOpt: 'direct', 'Receive only direct super/subclasses from the DL query. Default is false.', type: Boolean
_ longOpt: 'class-mode', 'Return only one label per matching IRI.', type: Boolean
_ longOpt: 'field', 'Return only this metadata field', args: 1

// annotation options
t longOpt: 'text', 'A file or directory of files to annotate.', args: 1
@@ -33,6 +34,7 @@
_ longOpt: 'per-line', 'Process each line of each file separately (useful for field-based data, e.g. downloaded with get_metadata)', type: Boolean
_ longOpt: 'disable-modifiers', 'Don\'t evaluate negation and uncertainty (evaluating them takes a lot of time)', type: Boolean
_ longOpt: 'family-modifier', 'Evaluate sentences for whether or not they mention a family member.', type: Boolean
_ longOpt: 'sentiment', 'Get sentiment score for annotations', type: Boolean
_ longOpt: 'allergy-modifier', 'Evaluate sentences for whether or not they mention an allergy', type: Boolean
_ longOpt: 'exclude', 'A list of phrases, which when matched in a sentence, will cause that sentence not to be annotated. One phrase per line.', args: 1
_ longOpt: 'write-pdfs-to-dir', 'If set, write the converted PDF text into the given directory.', args: 1
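The two options added above are the CLI surface of this commit: --field narrows get_metadata output to a single metadata property, and --sentiment asks the annotator to attach sentiment scores to each annotation. Below is an illustrative sketch of how such flags are declared and read with Groovy's CliBuilder, the same mechanism App.groovy uses; the option names match the diff, but the usage string, parse arguments, and printed output are hypothetical.

// Illustrative sketch only, not part of the commit. Assumes a Groovy 2.x-style
// CliBuilder (groovy.util.CliBuilder, which Groovy auto-imports).
def cli = new CliBuilder(usage: 'komenti <command> [options]')
cli._(longOpt: 'field', 'Return only this metadata field', args: 1)
cli._(longOpt: 'sentiment', 'Get sentiment score for annotations', type: Boolean)

def o = cli.parse(['--sentiment', '--field', 'definition'])
println o['sentiment']   // true, because --sentiment was passed
println o['field']       // 'definition'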
17 changes: 12 additions & 5 deletions src/main/groovy/komenti/Komenti.groovy
@@ -128,12 +128,13 @@ public class Komenti {

static def get_metadata(o) {
def outDir = getOutDir(o)
def files = [:]
ConcurrentHashMap files = [:]

def vocab = Vocabulary.loadFile(o.l)
def komentisto = new Komentisto(vocab,
o['disable-modifiers'],
o['family-modifier'],
o['sentiment'],
o['allergy-modifier'],
false,
o['exclude'],
@@ -155,17 +156,22 @@

if(!o['decompose-entities']) { entityLabels = [] }

classLabels.each { iri, l ->
def i = 0
GParsPool.withPool(o['threads'] ?: 1) { p ->
classLabels.eachParallel{ iri, l ->
println "(${++i}/${classLabels.size()})"
KomentLib.AOSemanticQuery("<$iri>", l.o, false, "equivalent", { classes ->
// We want the actual class, not just semantically equivalent ones (though it might arguably be better to collect metadata from those too, since they are, after all, semantically equivalent to this class)
def c = classes.find { it.class == iri }
def metadata = KomentLib.AOExtractMetadata(c, entityLabels)
def metadata = KomentLib.AOExtractMetadata(c, entityLabels, o['field'])
if(o['lemmatise']) { // we do it per line here, since it's a field based document
metadata = metadata.split('\n').collect { komentisto.lemmatise(it) }.join('\n')
}
files[l.l[0]] = metadata
// TODO this is bad and lazy: keying the output file by the last IRI path segment is fragile
files[iri.tokenize('/').last()] = metadata
})
}
}

println "Writing metadata files for ${files.size()} classes."
files.each { n, c ->
@@ -196,6 +202,7 @@ public class Komenti {
def komentisto = new Komentisto(vocab,
o['disable-modifiers'],
o['family-modifier'],
o['sentiment'],
o['allergy-modifier'],
o['extract-triples'],
o['exclude'],
@@ -373,7 +380,7 @@ public class Komenti {
if(o['id-list-only']) {
writeOutput(aids.join('\n'), o, "Saved pmcids to $o.out!")
} else {
def komentisto = new Komentisto(false, true, o['family-modifier'], o['allergy-modifier'], false, o['exclude'], o['threads'] ?: 1)
def komentisto = new Komentisto(false, true, o['family-modifier'], o['sentiment'], o['allergy-modifier'], false, o['exclude'], o['threads'] ?: 1)
def abstracts = []
aids.each { pmcid ->
KomentLib.PMCGetAbstracts(pmcid, { a ->
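The get_metadata change above replaces the sequential classLabels.each loop with GParsPool.eachParallel and switches files to a ConcurrentHashMap so worker threads can store results concurrently. Here is a minimal sketch of the same pattern; it is illustrative only, queryMetadata is a hypothetical stand-in for the KomentLib query calls, and it uses an AtomicInteger because the committed progress counter (++i on a plain int) is not atomic across threads.

// Illustrative sketch only, not part of the commit: the GParsPool pattern used
// in get_metadata above, with a thread-safe progress counter.
import groovyx.gpars.GParsPool
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicInteger

def queryMetadata = { iri -> "metadata for $iri" }          // hypothetical stand-in
def classLabels = [
    'http://example.org/onto/A': 'label a',
    'http://example.org/onto/B': 'label b'
]
def files = new ConcurrentHashMap()                         // safe for concurrent puts
def done = new AtomicInteger(0)

GParsPool.withPool(4) {
    classLabels.eachParallel { iri, l ->
        println "(${done.incrementAndGet()}/${classLabels.size()})"
        files[iri.tokenize('/').last()] = queryMetadata(iri)
    }
}
println "Writing metadata files for ${files.size()} classes."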
55 changes: 34 additions & 21 deletions src/main/groovy/komenti/klib/KomentLib.groovy
@@ -29,7 +29,7 @@ class KomentLib {
} else {
println "Error code: ${err.getStatusCode()}"
}
System.exit(1)
//System.exit(1)
}

static def AOQueryNames(query, cb) {
@@ -39,6 +39,7 @@
cb(json.result)
}
} catch(e) {
println "Error query: $query"
AOAPIErrorHandler(e)
}
}
Expand Down Expand Up @@ -115,32 +116,44 @@ class KomentLib {
}

// metadata to text
static def AOExtractMetadata(c, dLabels) {
static def AOExtractMetadata(c, dLabels, field) {
def out = ''
c.each { k, v ->
if(k == 'SubClassOf') { return; }
if(k.length() > 30) { return; } // try to remove some of the bugprops
if(v instanceof Collection) {
out += "$k:\n"
v.unique(false).each {
out += " $it\n"

def dec = dLabels.findAll { l -> "$it".indexOf(l) != -1 }
if(dec) {
dec.each { d ->
out += " (decomposed) ${it.replace(d, '')}\n"
out += " (decomposed): ${d}\n"
if(field) {
if(k == field) {
if(v instanceof Collection) {
v.unique(false).each {
out += "$it\n"
}
} else {
out += "$v\n"
}
}
}
} else {
out += "$k: $v\n"
if(k == 'SubClassOf') { return; }
if(k.length() > 30) { return; } // try to remove some of the bugprops
if(v instanceof Collection) {
out += "$k:\n"
v.unique(false).each {
out += " $it\n"

def dec = dLabels.findAll { l -> "$it".indexOf(l) != -1 }
if(dec) {
dec.each { d ->
out += " (decomposed) ${it.replace(d, '')}\n"
out += " (decomposed): ${d}\n"
}
}
}
} else {
out += "$k: $v\n"

def dec = dLabels.findAll { l -> "$v".indexOf(l) != -1 }
if(dec) {
dec.each { d ->
out += "$k (decomposed): ${v.replace(d, '')}\n"
out += "$k (decomposed): ${d}\n"
def dec = dLabels.findAll { l -> "$v".indexOf(l) != -1 }
if(dec) {
dec.each { d ->
out += "$k (decomposed): ${v.replace(d, '')}\n"
out += "$k (decomposed): ${d}\n"
}
}
}
}
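AOExtractMetadata now takes a third argument, field: when it is set, only that property's values are emitted, one per line, and the previous behaviour (skipping SubClassOf and over-long keys, decomposing values against the entity labels) moves into the else branch. The diff view above interleaves the removed and added bodies, which is why the decomposition lines appear twice. The sketch below shows only the field-filtering logic, with a hypothetical helper name and without the decomposition step.

// Simplified, illustrative sketch of the field filter added to AOExtractMetadata;
// extractMetadata is a hypothetical stand-alone helper, not the real method.
def extractMetadata(Map props, String field = null) {
  def out = new StringBuilder()
  props.each { k, v ->
    if (field) {
      if (k == field) {                                      // keep only the requested field
        def values = v instanceof Collection ? v.unique(false) : [v]
        values.each { out << "$it\n" }
      }
    } else {
      if (k == 'SubClassOf' || k.length() > 30) { return }   // skip noisy properties
      if (v instanceof Collection) {
        out << "$k:\n"
        v.unique(false).each { out << " $it\n" }
      } else {
        out << "$k: $v\n"
      }
    }
  }
  out.toString()
}

println extractMetadata([label: ['heart disease'], definition: 'A disease of the heart.'], 'definition')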
27 changes: 22 additions & 5 deletions src/main/groovy/komenti/klib/Komentisto.groovy
@@ -6,6 +6,8 @@ import edu.stanford.nlp.semgraph.*
import edu.stanford.nlp.ie.util.RelationTriple
import edu.stanford.nlp.util.*
import edu.stanford.nlp.naturalli.*
import edu.stanford.nlp.sentiment.*
import edu.stanford.nlp.neural.rnn.*

public class Komentisto {
def REP_TOKEN = 'biscuit'
@@ -23,8 +25,9 @@ public class Komentisto {
def enableIE
def vocabulary
def threads
def sentiment

def Komentisto(vocabulary, disableModifiers, familyModifier, allergyModifier, enableIE, excludeFile, threads) {
def Komentisto(vocabulary, disableModifiers, familyModifier, sentiment, allergyModifier, enableIE, excludeFile, threads) {
this.vocabulary = vocabulary

uncertainTerms = UNC_WORDS_FILE.getText().split('\n')
@@ -39,6 +42,7 @@
this.allergyModifier = allergyModifier
this.enableIE = enableIE
this.threads = threads
this.sentiment = sentiment

initialiseCoreNLP()
}
@@ -54,6 +58,7 @@
aList.removeAll(["ner", "regexner", "entitymentions"])
}
if(enableIE) { aList += ["depparse", "natlog", "openie"] }
if(sentiment) { aList += [ "parse", "sentiment" ] }
println aList
props.put("annotators", aList.join(', '))

@@ -84,7 +89,7 @@
def aDocument = new edu.stanford.nlp.pipeline.Annotation(text.toLowerCase())

// TODO I think we may be able to use the 'Annotator.Requirement' class to determine what needs to be run
[ "tokenize", "ssplit", "ner", "regexner", "entitymentions" ].each {
[ "tokenize", "ssplit", "ner", "regexner", "entitymentions", "parse", "sentiment" ].each {
coreNLP.getExistingAnnotator(it).annotate(aDocument)
}

@@ -114,6 +119,20 @@
if(!disableModifiers) {
def tags = evaluateSentenceConcept(sentence, ner) // add all tags that returned true
a.tags = tags.findAll { it.getValue() }.collect { it.getKey() }

// Thanks for helping me figure this out!! <3 https://github.com/Ruthwik/Sentiment-Analysis/
if(sentiment) {
def tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
def sm = RNNCoreAnnotations.getPredictions(tree);
def sentimentType = sentence.get(SentimentCoreAnnotations.SentimentClass.class);

a.tags << "SentimentClass:$sentimentType"
a.tags << "S:VP:${(double)Math.round(sm.get(4) * 100d)}"
a.tags << "S:P:${(double)Math.round(sm.get(3) * 100d)}"
a.tags << "S:NEUT:${(double)Math.round(sm.get(2) * 100d)}"
a.tags << "S:N:${(double)Math.round(sm.get(1) * 100d)}"
a.tags << "S:VN:${(double)Math.round(sm.get(0) * 100d)}"
}
}

results << a
@@ -216,9 +235,7 @@

if(allergyModifier) {
out.allergy = text =~ ALLERGY_PATTERN
}

out
}
}

def lemmatise(text) {
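The sentiment support above adds the CoreNLP parse and sentiment annotators and, for each sentence, records both the predicted sentiment class and the five class probabilities from the RNN prediction vector, where index 0 is very negative and index 4 very positive (the S:VN through S:VP tags). Below is a minimal standalone sketch of those CoreNLP calls; it is illustrative only, assumes stanford-corenlp and its English models are on the classpath, and is not part of the commit.

// Illustrative sketch of the CoreNLP sentiment calls used in Komentisto above.
import edu.stanford.nlp.pipeline.Annotation
import edu.stanford.nlp.pipeline.StanfordCoreNLP
import edu.stanford.nlp.ling.CoreAnnotations
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations

def props = new Properties()
props.put("annotators", "tokenize, ssplit, parse, sentiment")
def pipeline = new StanfordCoreNLP(props)

def doc = new Annotation("the treatment worked surprisingly well")
pipeline.annotate(doc)

doc.get(CoreAnnotations.SentencesAnnotation.class).each { sentence ->
    def tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class)
    def sm = RNNCoreAnnotations.getPredictions(tree)       // 5-element probability vector
    def sentimentType = sentence.get(SentimentCoreAnnotations.SentimentClass.class)
    // index 0 = very negative ... index 4 = very positive, matching the S:* tags
    println "$sentimentType " + (0..4).collect { Math.round(sm.get(it) * 100d) }
}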
