-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdataParse.py
executable file
·112 lines (93 loc) · 3.79 KB
/
dataParse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/python
'''
Parse the Task A and Task B data from the SemEval competition.
Both parsers return a (data, test) pair of nested dictionaries.
Usage parseB():
data[SID\tUID].keys() - gives the subjects associated with
the given ID
data[SID\tUID][<subject>]['polar'] - the tag of the subject
data[SID\tUID][<subject>]['tweet'] - the contents of the tweet
Usage parseA():
data[SID\tUID].keys() - the index tuples marking the parts of the
phrase to be indexed
data[SID\tUID][<index>]['polar'] - the tag of the phrase
data[SID\tUID][<index>]['tweet'] - the contents of the tweet
'''
#-----------------------------------------------------------------------------#
# Fraction of the lines intended for training.
# NOTE(review): not referenced anywhere in the visible code - confirm it is
# still needed.
PERCENTTRAIN = .8
# Fraction of the lines that makes up one test partition; parseA() and
# parseB() multiply it by the segment number to find the partition bounds.
PERCENTTEST = .2
from collections import defaultdict
#-----------------------------------------------------------------------------#
def getTotalLines(filename):
    '''
    Return the number of lines in the file at filename.

    filename - path of the text file to count

    An empty file yields 0. A final line without a trailing newline still
    counts as a line.
    '''
    # The with-block guarantees the handle is closed, even if reading fails;
    # the previous inline open() left the file open until garbage collection.
    with open(filename, 'r') as infile:
        return sum(1 for _ in infile)
#-----------------------------------------------------------------------------#
def parseB(filename,segment):
    '''
    Parse a Task B file and split it into training and held-out test data.

    Every non-blank line must hold at least five tab-separated fields,
    used in order as: SID, UID, subject, polarity tag, tweet text.

    filename - path of the tab-separated file to read
    segment  - 1-based number of the partition to hold out as test data;
               each partition spans PERCENTTEST of the file's lines

    Returns (data, test). Both are nested defaultdicts addressed as
    [SID<TAB>UID][subject]['polar'] and [SID<TAB>UID][subject]['tweet'].
    Lines inside the chosen partition go to test, all others to data.

    Raises IndexError if a non-blank line has fewer than five fields.
    '''
    totalLines = getTotalLines(filename)
    data = defaultdict(lambda: defaultdict(lambda: defaultdict(str)))
    test = defaultdict(lambda: defaultdict(lambda: defaultdict(str)))
    startPartition = int(totalLines*((segment - 1) * PERCENTTEST))
    endPartition = int(totalLines*((segment) * PERCENTTEST))
    # The with-block closes the file even when a malformed line raises.
    with open(filename, 'r') as infile:
        # Line numbers are 1-based so they line up with the partition bounds.
        for count, line in enumerate(infile, 1):
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank line (e.g. a trailing newline): nothing to store, but
                # it still counts towards the line number.
                continue
            # The window is open at the start and closed at the end, so the
            # boundary line belongs to exactly one segment; the previous
            # inclusive lower bound put it in two neighbouring test sets.
            if startPartition < count <= endPartition:
                target = test
            else:
                target = data
            dictID = fields[0]+'\t'+fields[1]
            target[dictID][fields[2]]['polar'] = fields[3].strip('\"')
            target[dictID][fields[2]]['tweet'] = fields[4]
    return data, test
#-----------------------------------------------------------------------------#
def parseA(filename,segment):
    '''
    Parse a Task A file and split it into training and held-out test data.

    Every non-blank line must hold at least six tab-separated fields,
    used in order as: SID, UID, first index, second index, polarity tag,
    tweet text. The two index fields are kept as strings and combined into
    a tuple that keys the phrase.

    filename - path of the tab-separated file to read
    segment  - 1-based number of the partition to hold out as test data;
               each partition spans PERCENTTEST of the file's lines

    Returns (data, test). Both are nested defaultdicts addressed as
    [SID<TAB>UID][(index, index)]['polar'] and
    [SID<TAB>UID][(index, index)]['tweet'].
    Lines inside the chosen partition go to test, all others to data.

    Raises IndexError if a non-blank line has fewer than six fields.
    '''
    totalLines = getTotalLines(filename)
    data = defaultdict(lambda: defaultdict(lambda: defaultdict(str)))
    test = defaultdict(lambda: defaultdict(lambda: defaultdict(str)))
    startPartition = int(totalLines*((segment - 1) * PERCENTTEST))
    endPartition = int(totalLines*((segment) * PERCENTTEST))
    # The with-block closes the file even when a malformed line raises.
    with open(filename, 'r') as infile:
        # Line numbers are 1-based so they line up with the partition bounds.
        for count, line in enumerate(infile, 1):
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank line (e.g. a trailing newline): nothing to store, but
                # it still counts towards the line number.
                continue
            # The window is open at the start and closed at the end, so the
            # boundary line belongs to exactly one segment; the previous
            # inclusive lower bound put it in two neighbouring test sets.
            if startPartition < count <= endPartition:
                target = test
            else:
                target = data
            dictID = fields[0]+'\t'+fields[1]
            index = (fields[2],fields[3])
            target[dictID][index]['polar'] = fields[4].strip('\"')
            target[dictID][index]['tweet'] = fields[5]
    return data, test
#-----------------------------------------------------------------------------#
if __name__ == '__main__':
    # Smoke run: parse both training files from the current directory.
    trainingFileA = 'trainA.txt'
    trainingFileB = 'trainB.txt'
    # Both parsers take the number of the partition to hold out as a
    # required second argument; calling them without it raises TypeError.
    segment = 1
    trainingDataA, testDataA = parseA(trainingFileA, segment)
    trainingDataB, testDataB = parseB(trainingFileB, segment)
    # Disabled debugging dump of the parsed structures (Python 2 print
    # syntax); kept for reference.
    #
    # for ID in testDataB.keys():
    #     for subject in testDataB[ID].keys():
    #         polar = testDataB[ID][subject]['polar']
    #         tweet = testDataB[ID][subject]['tweet']
    #         print '%s\t%s\t%s\t%s' %(ID,subject,polar,tweet)
    # for ID in trainingDataB.keys():
    #     for subject in trainingDataB[ID].keys():
    #         polar = trainingDataB[ID][subject]['polar']
    #         tweet = trainingDataB[ID][subject]['tweet']
    #         print '%s\t%s\t%s\t%s' %(ID,subject,polar,tweet)
    # for ID in trainingDataA.keys():
    #     for index in trainingDataA[ID].keys():
    #         polar = trainingDataA[ID][index]['polar']
    #         tweet = trainingDataA[ID][index]['tweet']
    #         print '%s\t%s\t%s\t%s' %(ID,index,polar,tweet)
#-----------------------------------------------------------------------------#