-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added scripts used to process corpus
- Loading branch information
1 parent
c24a815
commit 2dc650f
Showing
2 changed files
with
98 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import re | ||
import sys | ||
|
||
|
||
# Earlier, narrower URL pattern, kept for reference only:
#URLS_RE = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b')

# URL-like token: an optional "http://" or "https://" scheme, a run of
# host/path characters, a dot followed by 2-6 letters, and then any
# trailing path/query characters.  Because the scheme is optional, bare
# "name.ext"-shaped tokens match as well.
URLS_RE = re.compile(
    r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
)

# List marker at the very start of a line: nothing, one lowercase letter,
# or up to three digits, followed by one or more "-" / "." characters and
# then a single space or newline.
LISTING_RE = re.compile(
    r'^(|[a-z]?|[0-9]{0,3})(\-|\.)+( |\n)'
)
|
||
def remove_urls(text):
    """Return `text` with every substring matched by URLS_RE deleted."""
    without_urls = URLS_RE.sub('', text)
    return without_urls
|
||
def replace_multi_whitespaces(line):
    """Collapse every whitespace run in `line` to one space and trim both ends."""
    tokens = line.split()
    return ' '.join(tokens)
|
||
def remove_listing(line):
    """Return `line` with a leading list marker (as matched by LISTING_RE) removed."""
    without_marker = LISTING_RE.sub('', line)
    return without_marker
|
||
def main():
    """Clean the corpus file named by the first command-line argument.

    Every input line is lower-cased, stripped of URL-like tokens and of a
    leading list marker, and has its whitespace collapsed before being
    written to standard output.  A line that is exactly a newline is echoed
    as an empty line; a line that ends up empty after cleaning is dropped.

    Exits with a usage message when no input path is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: {} INPUT_FILE'.format(sys.argv[0]))

    with open(sys.argv[1], "r") as input_file:
        for line in input_file:
            # Compare by value: `is` tests object identity, which only
            # happens to work for short strings because of interning.
            if line == '\n':
                print('')
                continue

            line = line.lower()
            line = remove_urls(line)
            line = remove_listing(line)
            line = replace_multi_whitespaces(line)

            # Skip lines that contained nothing but removable content.
            if line:
                print(line)
|
||
|
||
|
||
# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import unicodedata | ||
import sys | ||
|
||
def _is_punctuation(char): | ||
"""Checks whether `chars` is a punctuation character.""" | ||
cp = ord(char) | ||
# We treat all non-letter/number ASCII as punctuation. | ||
# Characters such as "^", "$", and "`" are not in the Unicode | ||
# Punctuation class but we treat them as punctuation anyways, for | ||
# consistency. | ||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or | ||
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): | ||
return True | ||
cat = unicodedata.category(char) | ||
if cat.startswith("P"): | ||
return True | ||
return False | ||
|
||
def _run_split_on_punc(text):
    """Separate punctuation from the surrounding text with spaces.

    `text` is cut into pieces: each punctuation character becomes a piece
    of its own, and every maximal run of non-punctuation characters
    (whitespace included) becomes one piece.  The pieces are returned
    joined by single spaces.
    """
    pieces = []
    run = []  # non-punctuation characters collected since the last cut
    for char in text:
        if _is_punctuation(char):
            if run:
                pieces.append("".join(run))
                run = []
            pieces.append(char)
        else:
            run.append(char)
    if run:
        pieces.append("".join(run))

    return ' '.join(pieces)
|
||
def replace_multi_whitespaces(line):
    """Collapse every whitespace run in `line` to one space and trim both ends."""
    tokens = line.split()
    return ' '.join(tokens)
|
||
def main():
    """Space-separate punctuation in the file named by the first argument.

    Every input line has its punctuation split off into separate tokens
    and its whitespace collapsed before being written to standard output.
    A line that is exactly a newline is echoed as an empty line; a line
    that ends up empty after processing is dropped.

    Exits with a usage message when no input path is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: {} INPUT_FILE'.format(sys.argv[0]))

    # NOTE: an optional output-file argument (sys.argv[2]) was sketched
    # here but never implemented; results always go to standard output.
    with open(sys.argv[1], "r") as input_file:
        for line in input_file:
            # Compare by value: `is` tests object identity, which only
            # happens to work for short strings because of interning.
            if line == '\n':
                print('')
                continue

            line = _run_split_on_punc(line)
            line = replace_multi_whitespaces(line)

            # Skip lines that contained only whitespace.
            if line:
                print(line)
|
||
|
||
# Run the punctuation splitter only when executed as a script, not on import.
if __name__ == '__main__':
    main()