-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocument.py
182 lines (147 loc) · 5.12 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import os
import datetime
from abc import ABC, abstractmethod
from django.template.defaultfilters import filesizeformat
import utc
class Document(ABC):
    """
    Abstract description of a parsed document and its keyword statistics.

    Subclasses provide the stored facts (hash, keywords, tags, dates, path,
    size and word count); the non-abstract methods here derive convenience
    queries from those facts.
    """

    def get_term_frequency(self, term: str) -> float:
        """
        Returns how often a term occurs relative to the size of this document.
        :param term: term in question
        :return: Occurrences of the term divided by the number of words,
                 or 0.0 if the document contains no words
        """
        num_words = self.get_num_words()
        if num_words == 0:
            # A document without words cannot contain the term; avoid ZeroDivisionError.
            return 0.0
        return self.get_occurrences(term) / num_words

    @abstractmethod
    def get_hash(self) -> str:
        """
        Returns the hash code of this document.
        :return: Documents hash
        """
        pass

    @abstractmethod
    def get_keywords(self):
        """
        Returns a map of all the keywords in this document and their occurrences.
        :return: Map of keywords to occurrences
        """
        pass

    def get_occurrences(self, keyword: str) -> int:
        """
        Returns the number of times a given keyword occurs in this document.
        :param keyword: keyword in question
        :return: Occurrences of the keyword, 0 if it is not in the keyword map
        """
        return self.get_keywords().get(keyword, 0)

    def occurs(self, keyword: str) -> bool:
        """
        Returns True if the given keyword exists in this document.
        :param keyword: The keyword in question
        :return: True if the keyword occurs at least once, False otherwise
        """
        return self.get_occurrences(keyword) > 0

    @abstractmethod
    def get_tags(self):
        """
        Returns the tags of this document.
        :return: Container of tags; it must support the ``in`` operator (see has_tag)
        """
        pass

    def has_tag(self, tag: str) -> bool:
        """
        Returns True if this document carries the given tag.
        :param tag: The tag in question
        :return: True if the tag is contained in get_tags(), False otherwise
        """
        return tag in self.get_tags()

    @abstractmethod
    def get_parse_date(self) -> datetime.datetime:
        """
        Returns the time when this document was parsed last.
        :return: Last parse time
        """
        pass

    @abstractmethod
    def get_create_date(self) -> datetime.datetime:
        """
        Returns the time when this document was created.
        :return: Creation time
        """
        pass

    @abstractmethod
    def get_edit_date(self) -> datetime.datetime:
        """
        Returns the time when this document was edited last.
        :return: Last edit time
        """
        pass

    @abstractmethod
    def get_file_path(self) -> str:
        """
        Returns the path of the file for this document.
        :return: path of this document
        """
        pass

    @abstractmethod
    def get_file_size(self) -> int:
        """
        Returns the size of the file for this document.
        :return: size of this document
        """
        pass

    def get_pretty_file_size(self) -> str:
        """
        Returns the file size as formatted by Django's ``filesizeformat`` filter.
        :return: Human-readable file size
        """
        return filesizeformat(self.get_file_size())

    @abstractmethod
    def get_num_words(self) -> int:
        """
        Returns the number of words in the document.
        :return: number of words in document
        """
        pass

    def is_same(self, doc) -> bool:
        """
        Returns True if the given document is a duplicate of this one.
        Only the hash codes are compared; use ``==`` for a field-by-field comparison.
        :param doc: other document to compare
        :return: True if they have the same hash code, False otherwise
        """
        return self.get_hash() == doc.get_hash()

    def __eq__(self, other):
        """
        Compares two documents field by field.
        :param other: object to compare with
        :return: True if hash, keywords, dates, path, size and word count all match;
                 NotImplemented for non-Document objects so Python can try the
                 reflected comparison (``doc == 5`` still evaluates to False)
        """
        if not isinstance(other, Document):
            return NotImplemented
        return self.get_hash() == other.get_hash() \
            and self.get_keywords() == other.get_keywords() \
            and self.get_parse_date() == other.get_parse_date() \
            and self.get_file_path() == other.get_file_path() \
            and self.get_create_date() == other.get_create_date() \
            and self.get_edit_date() == other.get_edit_date() \
            and self.get_file_size() == other.get_file_size() \
            and self.get_num_words() == other.get_num_words()

    def __hash__(self):
        """
        Hashes the document by its hash code.
        Defining __eq__ alone would make instances unhashable; equal documents
        always share get_hash(), so this stays consistent with __eq__.
        :return: Hash of get_hash()
        """
        return hash(self.get_hash())
class SimpleDocument(Document):
    """
    Document implementation that simply returns the values it was constructed with.
    """

    def __init__(self, hash_val: str, keywords, file_path: str,
                 create_date: datetime.datetime, edit_date: datetime.datetime,
                 file_size: int, num_words: int,
                 parse_date: datetime.datetime = None):
        """
        Stores the given document facts.
        :param hash_val: hash code of the document
        :param keywords: map of keywords to occurrences
        :param file_path: path of the document's file
        :param create_date: time the document was created
        :param edit_date: time the document was edited last
        :param file_size: size of the document's file
        :param num_words: number of words in the document
        :param parse_date: time the document was parsed; defaults to ``utc.now()``
                           evaluated when the instance is created
        """
        if parse_date is None:
            # Resolve the default per call. A ``utc.now()`` default in the signature
            # is evaluated only once, when the class is defined, so every document
            # would share that single import-time timestamp.
            parse_date = utc.now()
        self._hash = hash_val
        self._keywords = keywords
        self._file_path = file_path
        self._parse_date = parse_date
        self._create_date = create_date
        self._edit_date = edit_date
        self._file_size = file_size
        self._num_words = num_words

    def get_hash(self):
        return self._hash

    def get_keywords(self):
        return self._keywords

    def get_parse_date(self):
        return self._parse_date

    def get_create_date(self):
        return self._create_date

    def get_edit_date(self):
        return self._edit_date

    def get_file_path(self):
        return self._file_path

    def get_file_size(self):
        return self._file_size

    def get_num_words(self):
        return self._num_words

    def get_tags(self):
        # Simple documents carry no tags, so has_tag() is always False.
        return {}

    @staticmethod
    def find_create_and_mod(file_path) -> "tuple[datetime.datetime, datetime.datetime]":
        """
        Reads the creation and modification times of a file.

        This method uses ctime to find creation time (Windows), or last metadata
        change (Unix). No platform check is performed; the meaning of the first
        value therefore depends on the operating system.
        :param file_path: path of the file to inspect
        :return: Tuple of (ctime, last modified time), both converted with
                 ``datetime.datetime.fromtimestamp``
        """
        file_stat = os.stat(file_path)
        create_date = datetime.datetime.fromtimestamp(file_stat.st_ctime)
        mod_date = datetime.datetime.fromtimestamp(file_stat.st_mtime)
        return create_date, mod_date

    def __str__(self):
        # A document is displayed as its file path.
        return self.get_file_path()

    def __repr__(self):
        return str(self)