Added scripts used to process corpus

josecannete · May 21, 2019 · 2dc650f · 2dc650f
1 parent c24a815
commit 2dc650f
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 0 deletions.
diff --git a/corpus_processing.py b/corpus_processing.py
@@ -0,0 +1,38 @@
+import re
+import sys
+
+
+# Earlier pattern, kept for reference; it required a literal "://" in the match:
+#URLS_RE = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b')
+# Matches URL-like tokens: an optional "http://" or "https://" prefix, one or
+# more host/path characters, a dot followed by 2-6 ASCII letters, and then any
+# run of trailing URL characters (path, query string, fragment).
+# NOTE(review): because the scheme is optional, any "word.letters" token also
+# matches (e.g. two words joined by a period with no space) -- confirm that
+# this aggressive matching is intended.
+URLS_RE = re.compile(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')
+
+# Matches a list marker at the very start of the string: nothing, a single
+# lowercase letter, or up to three digits, followed by one or more "-" or "."
+# characters and then a space or newline (e.g. "- ", "a. ", "12.- ").
+# No re.MULTILINE flag is used, so "^" only anchors at the start of the input.
+LISTING_RE = re.compile(r'^(|[a-z]?|[0-9]{0,3})(\-|\.)+( |\n)')
+
+def remove_urls(text):
+	return URLS_RE.sub('', text)
+
+def replace_multi_whitespaces(line):
+	return ' '.join(line.split())
+
+def remove_listing(line):
+	return LISTING_RE.sub('', line)
+
+def main():	
+
+	with open(sys.argv[1], "r") as input_file:
+
+		for line in input_file:
+			if line is '\n':
+				print('')
+			else:
+				line = line.lower()
+				line = remove_urls(line)
+				line = remove_listing(line)
+				line = replace_multi_whitespaces(line)
+
+				if line is not '':
+					print(line)
+
+
+
+# Run the cleaning pipeline only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
diff --git a/split_punctuation.py b/split_punctuation.py
@@ -0,0 +1,60 @@
+import unicodedata
+import sys
+
+def _is_punctuation(char):
+  """Checks whether `chars` is a punctuation character."""
+  cp = ord(char)
+  # We treat all non-letter/number ASCII as punctuation.
+  # Characters such as "^", "$", and "`" are not in the Unicode
+  # Punctuation class but we treat them as punctuation anyways, for
+  # consistency.
+  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+    return True
+  cat = unicodedata.category(char)
+  if cat.startswith("P"):
+    return True
+  return False
+
+def _run_split_on_punc(text):
+    """Splits punctuation on a piece of text."""
+    chars = list(text)
+    i = 0
+    start_new_word = True
+    output = []
+    while i < len(chars):
+      char = chars[i]
+      if _is_punctuation(char):
+        output.append([char])
+        start_new_word = True
+      else:
+        if start_new_word:
+          output.append([])
+        start_new_word = False
+        output[-1].append(char)
+      i += 1
+
+    return ' '.join(["".join(x) for x in output])
+
+def replace_multi_whitespaces(line):
+	return ' '.join(line.split())
+
+def main():	
+
+	with open(sys.argv[1], "r") as input_file:
+		#if len(sys.argv) > 2:
+		#	with output_file = open(sys.argv[2], "w") as output_file:
+		for line in input_file:
+
+			if line is '\n':
+				print('')
+			else:
+				line = _run_split_on_punc(line)
+				line = replace_multi_whitespaces(line)
+
+				if line is not '':
+					print(line)
+
+
+# Run the punctuation splitter only when this file is executed as a script.
+if __name__ == '__main__':
+    main()