LdaLiwc.py
# Run LDA topic modelling on user texts.
# Preprocessing steps: restore punctuation, strip URLs and @mentions, tokenize,
# stem, and remove stop words; per-user topic distributions are written to CSV.
import liwc_entropy as liwcEntropy
import parse_data as parse
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import re
class LdaLiwc:
    txt_data_path = '../data/schizophrenia_txt/'
    output_path = './ldafeatures.csv'
    output_path2 = './ldafeatures2.csv'
    # frequently occurring tokens to drop in addition to the standard stop words
    mylist = ['just', 'via', 'make', 'can', 'amp', 'get', 'nbsp']

    def __init__(self):
        self.tokenizer = RegexpTokenizer(r"[@a-z0-9]['a-z0-9]*")
        self.p_stemmer = PorterStemmer()
        # stem the stop words so they match tokens that have already been stemmed
        self.en_stop = [self.p_stemmer.stem(i) for i in get_stop_words('en')]
        self.topicNum = 20
    def getSortedId(self):
        # user ids sorted by label so that rows in the output are grouped by class
        userIds = parse.get_user_ids(range(10))
        sortedId = sorted(userIds, key=lambda user: parse.get_label(user))
        return sortedId
    def getSortedDocumentList(self):
        # full per-user documents, kept for processOriginal()
        self.userDocumentList = [' '.join(open('%s%s.txt' % (self.txt_data_path, user_id)))
                                 for user_id in self.getSortedId()]
        # the same files split into chunks of at most `maxline` lines,
        # so LDA is trained on many shorter documents
        documentLineList = [open('%s%s.txt' % (self.txt_data_path, user_id)).readlines()
                            for user_id in self.getSortedId()]
        tempDocumentList = []
        for documentLine in documentLineList:
            maxline = 300
            for idx, line in enumerate(documentLine):
                if idx % maxline == 0:
                    # start a new chunk every `maxline` lines
                    newDoc = []
                    tempDocumentList.append(newDoc)
                newDoc.append(line)
        documentList = [' '.join(tempDocument) for tempDocument in tempDocumentList]
        print('document num:', len(documentList))
        return documentList
    def runLDA(self):
        documentList = self.getSortedDocumentList()
        print('document loaded')
        tokensList = self.tokenized(documentList)
        print('tokenized')
        filterList = self.preprocessing(tokensList)
        self.dictionary = corpora.Dictionary(filterList)
        print('dictionary built')
        self.corpus = [self.dictionary.doc2bow(text) for text in filterList]
        # self.tfidf = models.TfidfModel(self.corpus)
        # print('using tfidf')
        self.ldamodel = gensim.models.ldamodel.LdaModel(
            self.corpus, num_topics=self.topicNum, id2word=self.dictionary, passes=20)
        print('finished training LDA')
        print('process original corpus')
        self.processOriginal()
    def processOriginal(self):
        # bag-of-words vectors for the full per-user documents, using the
        # dictionary learned from the chunked training corpus
        tokensList = self.tokenized(self.userDocumentList)
        print('tokenized')
        filterList = self.preprocessing(tokensList)
        self.corpusTotal = [self.dictionary.doc2bow(text) for text in filterList]
    def preprocessing(self, tokensList):
        # filterList = self.filterLIWC(tokensList)
        # print('filter by LIWC')
        # print(filterList[0][:20])
        tokensList = self.stemming(tokensList)
        print('stemming done')
        tokensList = self.removeStopWord(tokensList)
        print('stop word removed')
        print(tokensList[5][:50])
        print(tokensList[20][:50])
        return tokensList
    def outputLDAfeature(self):
        sortedId = self.getSortedId()
        # tf-idf variant (needs self.tfidf from runLDA), kept for reference:
        # featuresList = [self.ldamodel.get_document_topics(self.tfidf[self.corpus[documentId]])
        #                 for documentId in range(len(sortedId))]
        # with open(self.output_path, 'w') as file:
        #     for userId, features in zip(sortedId, featuresList):
        #         file.write(userId)
        #         featuresVec = [0 for i in range(self.topicNum)]
        #         for dId, feature in features:
        #             featuresVec[dId] = feature
        #         for feature in featuresVec:
        #             file.write(", " + str(feature))
        #         file.write("\n")
        featuresList = [self.ldamodel.get_document_topics(self.corpusTotal[documentId])
                        for documentId in range(len(sortedId))]
        with open(self.output_path, 'w') as file:
            for userId, features in zip(sortedId, featuresList):
                file.write(userId)
                # dense topic vector; topics LDA does not report keep probability 0
                featuresVec = [0 for i in range(self.topicNum)]
                for dId, feature in features:
                    featuresVec[dId] = feature
                for feature in featuresVec:
                    file.write(", " + str(feature))
                file.write("\n")
    def tokenized(self, documentList):
        tokensList = []
        for document in documentList:
            # restore colons, drop NEWLINE markers, and strip URLs and @mentions
            document = re.sub('COLON', ':', document)
            document = re.sub('NEWLINE', ' ', document)
            document = re.sub(r'http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', document)
            document = re.sub(r'@[a-zA-Z0-9_]*', '', document)
            newDoc = document.lower()
            tokens = self.tokenizer.tokenize(newDoc)
            tokensList.append(tokens)
        return tokensList
    def removeStopWord(self, tokensList):
        # drop stemmed stop words, tokens shorter than 3 characters, and custom noise words
        stopList = []
        for tokens in tokensList:
            stopped_tokens = [i for i in tokens if
                              i not in self.en_stop and
                              len(i) >= 3 and
                              i not in self.mylist]
            stopList.append(stopped_tokens)
        return stopList
    def filterLIWC(self, tokensList):
        # map each token to its first matching LIWC category (currently unused)
        filterList = []
        liwc = liwcEntropy.LiwcEntropy()
        for tokens in tokensList:
            filterTokens = []
            for token in tokens:
                for category in liwc.read_token(token):
                    filterTokens.append(category[0])
                    break
            filterList.append(filterTokens)
        return filterList
    def stemming(self, tokensList):
        stemList = []
        for tokens in tokensList:
            stemmed_tokens = [self.p_stemmer.stem(i) for i in tokens]
            stemList.append(stemmed_tokens)
        return stemList
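
# A minimal sketch (not part of the original pipeline) of how the CSV written by
# outputLDAfeature() could be read back into per-user topic vectors. It assumes each
# row has the form "user_id, p0, p1, ..., p19" exactly as written above; the function
# name and default path below are illustrative, not an established API.
def load_lda_features(path='./ldafeatures.csv'):
    user_ids, vectors = [], []
    with open(path) as f:
        for row in f:
            parts = [p.strip() for p in row.strip().split(',')]
            if not parts or not parts[0]:
                continue
            user_ids.append(parts[0])
            vectors.append([float(v) for v in parts[1:]])
    return user_ids, vectors
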
if __name__ == '__main__':
    ldaliwc = LdaLiwc()
    ldaliwc.runLDA()
    print(ldaliwc.ldamodel.print_topics(num_topics=ldaliwc.topicNum, num_words=20))
    ldaliwc.outputLDAfeature()