Skip to content

Commit

Permalink
purely summarizes now
Browse files Browse the repository at this point in the history
  • Loading branch information
dianzrong committed Jun 1, 2023
1 parent 6b8a564 commit f197253
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 129 deletions.
87 changes: 16 additions & 71 deletions puterbot/strategies/summary_mixin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Implements a ReplayStrategy mixin which identifies whether windows differ
by summarizing their contents.
Implements a ReplayStrategy mixin which summarizes the content of texts.
Usage:
Expand All @@ -15,10 +15,6 @@
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from nltk.corpus import wordnet
import re
from fuzzywuzzy import fuzz
import statistics


class SummaryReplayStrategyMixin(BaseReplayStrategy):
Expand All @@ -32,74 +28,23 @@ def __init__(
):
"""
See base class.
Additional Attributes:
- stemmer:
- summarizer:
"""
super().__init__(recording)
self.stemmer = Stemmer("english")
summarizer = LsaSummarizer(self.stemmer)
summarizer.stop_words = get_stop_words("english")
self.summarizer = summarizer

def get_summary(
    self,
    ascii_curr: str,
    ascii_prev: str,
    ocr_curr: str,
    ocr_prev: str,
) -> float:
    """Scores how alike the text of two screenshots is.

    Args:
        self: the SummaryReplayStrategyMixin
        ascii_curr: the ascii text of the current screenshot
        ascii_prev: the ascii text of the previous screenshot
        ocr_curr: the ocr text of the current screenshot
        ocr_prev: the ocr text of the previous screenshot

    Returns:
        The mean of two similarity values:
            1. the similarity of the summaries of the cleaned current and
               previous ASCII text
            2. the similarity of the summaries of the current and previous
               OCR text
    """
    # only the ASCII text goes through clean_ascii; the OCR text is compared as-is
    ascii_score = compare_text(clean_ascii(ascii_curr), clean_ascii(ascii_prev))
    ocr_score = compare_text(ocr_curr, ocr_prev)

    # may want the required minimum similarity
    return statistics.fmean([ascii_score, ocr_score])


def clean_ascii(
    text: str,
) -> str:
    """
    Keeps only the recognized words of the inputted text.

    Runs of characters that are neither word characters nor whitespace are
    replaced with a space, the result is split on whitespace, and every token
    for which wordnet.synsets() returns nothing is discarded.

    Args:
        text: the raw ascii text to clean

    Returns:
        The surviving tokens joined by single spaces, in their original order.
    """
    # swap every run of symbols for a space so neighbouring words stay separated
    stripped = re.sub(r"[^\w\s]+", " ", text)
    # a token counts as a word only when wordnet has at least one synset for it
    return " ".join(token for token in stripped.split() if wordnet.synsets(token))


def compare_text(
text1: str,
text2: str,
) -> int:
"""
Returns an int value between 0 and 100 (inclusive on both sides) representing how similar the 2 strings are.
"""
stemmer = Stemmer("english")
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words("english")

parser1 = PlaintextParser.from_string(text1, Tokenizer("english"))
summarized1 = summarizer(parser1.document, 1)

parser2 = PlaintextParser.from_string(text2, Tokenizer("english"))
summarized2 = summarizer(parser2.document, 1)
text: str,
num_sentences: int,
) -> str:
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarized = self.summarizer(parser.document, num_sentences)
return summarized

# may want to change to something more complex
return fuzz.WRatio(summarized1, summarized2)
74 changes: 16 additions & 58 deletions tests/puterbot/test_summary_mixin.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,24 @@
"""
Tests the 2 helper functions (clean_ascii and compare_text) in summary_mixin.py
Tests the get_summary function in summary_mixin.py
"""
from puterbot.strategies.summary_mixin import clean_ascii, compare_text
from fuzzywuzzy import fuzz

from puterbot.strategies.demo import DemoReplayStrategy
from puterbot.models import Recording, Screenshot

################################################################
# Clean ASCII tests
################################################################
RECORDING = Recording()
REPLAY = DemoReplayStrategy(RECORDING)


def test_clean_ascii_empty():
def test_summary_empty():
empty_text = ""
expected = ""
actual = clean_ascii(empty_text)
assert actual == expected


def test_clean_ascii_no_symbols_or_stopwords():
    """Text with no symbols and only kept words comes back unchanged."""
    text = "wow no symbols"
    assert clean_ascii(text) == "wow no symbols"


def test_clean_ascii_some_symbols_and_stopwords():
    """Punctuation is stripped and the token "this" is dropped."""
    text = "wow this! has some... symbols"
    assert clean_ascii(text) == "wow has some symbols"


def test_clean_ascii_all_symbols_and_stopwords():
    """Input made only of symbols and a dropped token cleans to an empty string."""
    text = "&*@#($)#!| ~~ this \\"
    assert clean_ascii(text) == ""


################################################################
# Compare text tests
################################################################


def test_compare_text_empty():
    """Two empty strings are expected to score 0."""
    first, second = "", ""
    assert compare_text(first, second) == 0


def test_compare_text_similar():
    """Two sentences with a similar meaning must score above the threshold of 50."""
    threshold = 50
    score = compare_text("I love sunshine so much.", "I adore the sun.")
    assert score > threshold
actual = REPLAY.get_summary(empty_text, 1)
assert len(actual) == 0


def test_compare_text_not_similar():
    """Two unrelated sentences must score below the threshold of 50."""
    threshold = 50
    score = compare_text(
        "I love sunshine so much",
        "Once upon a time, there was a princess.",
    )
    assert score < threshold
def test_summary_sentence():
    """A one-sentence summary of a single long sentence must score above 50 against it."""
    # adjacent literals are concatenated, giving the same text as the
    # backslash-continued single literal
    story = (
        "However, this bottle was not marked “poison,” so Alice ventured to taste it, "
        "and finding it very nice, (it had, in fact, a sort of mixed flavour of cherry-tart, "
        "custard, pine-apple, roast turkey, toffee, and hot buttered toast,) "
        "she very soon finished it off."
    )
    summary = REPLAY.get_summary(story, 1)
    assert fuzz.WRatio(summary, story) > 50

0 comments on commit f197253

Please sign in to comment.