Skip to content

Commit

Permalink
Merge pull request #15 from mozilla/issue4
Browse files (browse the repository at this point in the history)
The export has a client_id column, not user_id, so the code was changed to use client_id (see #4)
  • Loading branch information
kdavis-mozilla authored Dec 13, 2018
2 parents d6cae0f + f408604 commit 4d4571e
Show file tree
Hide file tree
Showing 17 changed files with 37 additions and 39 deletions.
1 change: 1 addition & 0 deletions src/corporacreator/corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def create(self):
corpus_data = corpora_data.loc[
lambda df: df.locale == locale,
[
"client_id",
"path",
"sentence",
"up_votes",
Expand Down
15 changes: 6 additions & 9 deletions src/corporacreator/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,8 @@ def create(self):
_logger.debug("Created %s corpora." % self.locale)

def _pre_process_corpus_data(self):
self.corpus_data["user_id"] = self.corpus_data["path"].str.split(
"/", expand=True
)[0] # TODO: Remove this line when the Gregor modifies the csv output to include user_id
preprocessor = getattr(preprocessors, self.locale.replace("-","")) # Get locale specific preprocessor
self.corpus_data["sentence"] = self.corpus_data[["user_id", "sentence"]].apply(func=lambda arg: preprocessor(*arg), axis=1)
self.corpus_data["sentence"] = self.corpus_data[["client_id", "sentence"]].apply(func=lambda arg: preprocessor(*arg), axis=1)

def _partition_corpus_data(self):
self.other = self.corpus_data.loc[
Expand All @@ -60,13 +57,13 @@ def _partition_corpus_data(self):

def _post_process_valid_data(self):
# Remove duplicate sentences while maintaining maximal user diversity at the frame's start (TODO: Make addition of user_sentence_count cleaner)
speaker_counts = self.valid["user_id"].value_counts()
speaker_counts = self.valid["client_id"].value_counts()
speaker_counts = speaker_counts.to_frame().reset_index()
speaker_counts.columns = ["user_id", "user_sentence_count"]
self.valid = self.valid.join(speaker_counts.set_index("user_id"), on="user_id")
self.valid = self.valid.sort_values(["user_sentence_count", "user_id"])
speaker_counts.columns = ["client_id", "user_sentence_count"]
self.valid = self.valid.join(speaker_counts.set_index("client_id"), on="client_id")
self.valid = self.valid.sort_values(["user_sentence_count", "client_id"])
valid = self.valid.groupby("sentence").head(self.args.duplicate_sentence_count)
valid = valid.sort_values(["user_sentence_count", "user_id"], ascending=False)
valid = valid.sort_values(["user_sentence_count", "client_id"], ascending=False)
valid = valid.drop(columns="user_sentence_count")
self.valid = self.valid.drop(columns="user_sentence_count")
# Determine train, dev, and test sizes
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/br.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def br(user_id, sentence):
def br(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/ca.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def ca(user_id, sentence):
def ca(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/cv.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def cv(user_id, sentence):
def cv(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/cy.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def cy(user_id, sentence):
def cy(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/de.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def de(user_id, sentence):
def de(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/en.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def en(user_id, sentence):
def en(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/fr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def fr(user_id, sentence):
def fr(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/gaIE.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def gaIE(user_id, sentence):
def gaIE(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/it.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def it(user_id, sentence):
def it(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/kab.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def kab(user_id, sentence):
def kab(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/ky.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def ky(user_id, sentence):
def ky(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/sl.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def sl(user_id, sentence):
def sl(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/tr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def tr(user_id, sentence):
def tr(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/tt.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def tt(user_id, sentence):
def tt(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/zhTW.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
def zhTW(user_id, sentence):
def zhTW(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
user_id (str): User ID of sentence's speaker
client_id (str): Client ID of sentence's speaker
sentence (str): Sentence to be cleaned up.
Returns:
Expand Down

0 comments on commit 4d4571e

Please sign in to comment.