From 68d0b728ce2fc424919b366e8fdd6bada1468342 Mon Sep 17 00:00:00 2001 From: Jenny Zhang Date: Tue, 16 Jun 2020 18:52:40 -0400 Subject: [PATCH 1/2] Update validation logic following email discussion w Kelly and Megan --- src/corporacreator/corpus.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/corporacreator/corpus.py b/src/corporacreator/corpus.py index 01beb02..2508569 100644 --- a/src/corporacreator/corpus.py +++ b/src/corporacreator/corpus.py @@ -58,17 +58,26 @@ def _preprocessor_wrapper(self, client_id, sentence, up_votes, down_votes): return pd.Series([sentence, up_votes, down_votes]) def _partition_corpus_data(self): + # If there are < 2 votes, or 2 opposing votes + # there is not enough information to make a determination self.other = self.corpus_data.loc[ - lambda df: (df.up_votes + df.down_votes) <= 1, : + lambda df: (df.up_votes + df.down_votes <= 1) + | ((df.up_votes == 1) & (df.down_votes == 1)), : ] + # If there are 2+ votes, and up_votes > down_votes, clip is valid self.validated = self.corpus_data.loc[ lambda df: (df.up_votes + df.down_votes > 1) & (df.up_votes > df.down_votes), :, ] + # If there are 2+ votes, and down_votes > up_votes, clip is invalid + # If there are 3+ votes, and up_votes == down_votes, opinions + # are diverging too much to be relied upon, and clip is invalid self.invalidated = self.corpus_data.loc[ lambda df: (df.up_votes + df.down_votes > 1) - & (df.up_votes <= df.down_votes), + & (df.up_votes < df.down_votes) + | ((df.up_votes == df.down_votes) + & (df.up_votes + df.down_votes > 2)), :, ] From e1e95ece3edd8c0a27521aedbbb345122fc66b6a Mon Sep 17 00:00:00 2001 From: Jenny Zhang Date: Wed, 17 Jun 2020 17:38:24 -0400 Subject: [PATCH 2/2] Slight clean-up for easier reading --- src/corporacreator/corpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/corporacreator/corpus.py b/src/corporacreator/corpus.py index 2508569..279855f 100644 --- a/src/corporacreator/corpus.py +++ b/src/corporacreator/corpus.py @@ -74,8 +74,8 @@ def _partition_corpus_data(self): # If there are 3+ votes, and up_votes == down_votes, opinions # are diverging too much to be relied upon, and clip is invalid self.invalidated = self.corpus_data.loc[ - lambda df: (df.up_votes + df.down_votes > 1) - & (df.up_votes < df.down_votes) + lambda df: ((df.up_votes + df.down_votes > 1) + & (df.up_votes < df.down_votes)) | ((df.up_votes == df.down_votes) & (df.up_votes + df.down_votes > 2)), :,