From 02c1746c9d1689218c44958a30cf4af5802a2146 Mon Sep 17 00:00:00 2001 From: Ksenia Berezina Date: Sun, 12 Jun 2022 19:01:42 -0400 Subject: [PATCH] Fixes #3703 - Automatically label nsfw issues --- tests/unit/test_helpers.py | 15 +++++ tests/unit/test_nsfw.py | 102 ++++++++++++++++++++++++++++++++++ tests/unit/test_webhook.py | 66 +++++++++++++++------- webcompat/db/__init__.py | 25 +++++++++ webcompat/form.py | 10 ++-- webcompat/helpers.py | 9 +++ webcompat/nsfw.py | 44 +++++++++++++++ webcompat/webhooks/helpers.py | 20 +++---- webcompat/webhooks/model.py | 15 +++++ 9 files changed, 273 insertions(+), 33 deletions(-) create mode 100644 tests/unit/test_nsfw.py create mode 100644 webcompat/nsfw.py diff --git a/tests/unit/test_helpers.py b/tests/unit/test_helpers.py index 16dea4f93..e28e54408 100644 --- a/tests/unit/test_helpers.py +++ b/tests/unit/test_helpers.py @@ -38,6 +38,7 @@ from webcompat.helpers import get_extra_labels from webcompat.helpers import get_filename_from_url from webcompat.helpers import is_darknet_domain +from webcompat.helpers import get_domains ACCESS_TOKEN_LINK = '; rel="next", ; rel="last", ; rel="first", ; rel="prev"' # noqa @@ -602,6 +603,20 @@ def test_get_response_headers_response(self): mime_type='text/html') assert new_headers2.get('content-type') == 'text/html' + def test_get_domains(self): + """Extract list of subdomains.""" + self.assertEqual( + get_domains('www.example.com'), ['example.com'] + ) + self.assertEqual( + get_domains('sub.example.com'), ['example.com'] + ) + self.assertEqual( + get_domains('part.sub.example.com'), + ['sub.example.com', 'example.com'] + ) + self.assertEqual(get_domains('test'), []) + if __name__ == '__main__': unittest.main() diff --git a/tests/unit/test_nsfw.py b/tests/unit/test_nsfw.py new file mode 100644 index 000000000..cdfdcd612 --- /dev/null +++ b/tests/unit/test_nsfw.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the 
Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Tests for the nsfw module.""" + +import unittest +import webcompat +from webcompat.nsfw import moderate_screenshot + + +class TestNSFW(unittest.TestCase): + """Tests for NSFW screenshot moderation.""" + + def setUp(self): + """Set up the tests.""" + webcompat.app.config['TESTING'] = True + + self.issue_body_with_screenshot = """ + + + + + + + **URL**: http://aturemlaguerra.org/ + + **Browser / Version**: Firefox 94.0 + **Operating System**: Mac OS X 10.15 + **Tested Another Browser**: Yes Chrome + + **Problem type**: Site is not usable + **Description**: Buttons or links not working + **Steps to Reproduce**: + gawrhs agrhsrthse sethsrthserhserhser +
+ View the screenshot + Screenshot +
+ """ # noqa + + self.issue_body_moderated = """ + + + + + + + **URL**: http://aturemlaguerra.org/ + + **Browser / Version**: Firefox 94.0 + **Operating System**: Mac OS X 10.15 + **Tested Another Browser**: Yes Chrome + + **Problem type**: Site is not usable + **Description**: Buttons or links not working + **Steps to Reproduce**: + gawrhs agrhsrthse sethsrthserhserhser +
+ View the screenshot + Screenshot removed - possible explicit content. +
+ """ # noqa + + self.issue_body_no_screenshot = """ + + + + + + + **URL**: http://aturemlaguerra.org/ + + **Browser / Version**: Firefox 94.0 + **Operating System**: Mac OS X 10.15 + **Tested Another Browser**: Yes Chrome + + **Problem type**: Site is not usable + **Description**: Buttons or links not working + **Steps to Reproduce**: + gawrhs agrhsrthse sethsrthserhserhser + """ # noqa + + def tearDown(self): + """Tear down the tests.""" + pass + + def test_moderate_screenshot(self): + """Moderate screenshot and remove image tag.""" + body = moderate_screenshot(self.issue_body_with_screenshot) + self.assertEqual(body, self.issue_body_moderated) + + def test_no_screenshot_unchanged(self): + """Body remains unchanged if there is no screenshot.""" + body = moderate_screenshot(self.issue_body_no_screenshot) + self.assertEqual(body, self.issue_body_no_screenshot) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/unit/test_webhook.py b/tests/unit/test_webhook.py index 9d80978c8..d73e28575 100644 --- a/tests/unit/test_webhook.py +++ b/tests/unit/test_webhook.py @@ -17,7 +17,7 @@ import webcompat -from webcompat.db import SiteGlobal, SiteRegional +from webcompat.db import SiteGlobal, SiteRegional, SiteNSFW from webcompat.helpers import to_bytes from webcompat.webhooks import helpers, ml @@ -160,6 +160,21 @@ def setUp(self): """ + self.issue_body13 = """ + + + + + + + **URL**: http://pornhub.com/ + """ # noqa + + self.issue_body14 = """ + **URL**: http://test.pornhub.com/ + + """ + self.issue_info1 = { 'action': 'foobar', 'state': 'open', @@ -402,29 +417,14 @@ def test_priority_label_not_found(self, mock_func): assert mock_func.call_count == 2 self.assertEqual(priority_label, None) - def test_get_domains(self): - """Extract list of subdomains.""" - self.assertEqual( - helpers.get_domains('www.example.com'), ['example.com'] - ) - self.assertEqual( - helpers.get_domains('sub.example.com'), ['example.com'] - ) - self.assertEqual( - 
helpers.get_domains('part.sub.example.com'), - ['sub.example.com', 'example.com'] - ) - self.assertEqual( - helpers.get_domains('test'), - [] - ) - + @patch('webcompat.db.site_nsfw_db.query') @patch('webcompat.db.regional_site_db.query') @patch('webcompat.db.global_site_db.query') - def test_get_issue_labels(self, gm, rm): + def test_get_issue_labels(self, gm, rm, nm): """Extract list of labels from an issue body.""" rm.return_value.filter_by.return_value.first.return_value = None gm.return_value.filter_by.return_value.first.return_value = None + nm.return_value.filter_by.return_value.first.return_value = None labels_tests = [ (self.issue_body, ['browser-firefox', 'type-media', @@ -445,6 +445,34 @@ def test_get_issue_labels(self, gm, rm): actual = helpers.get_issue_labels(issue_body) self.assertEqual(sorted(expected), sorted(actual)) + @patch('webcompat.db.site_nsfw_db.query') + def test_nsfw_label(self, nsfw_mock): + """Extract nsfw label.""" + nsfw_mock.return_value.filter_by.return_value.first.return_value = SiteNSFW( # noqa + 'pornhub.com' + ) + + nsfw_label = helpers.extract_nsfw_label(self.issue_body13) + self.assertEqual(nsfw_label, 'nsfw') + + @patch('webcompat.db.site_nsfw_db.query') + def test_nsfw_label_subdomain(self, nsfw_mock): + """Extract nsfw for a subdomain.""" + nsfw_mock.return_value.filter_by.return_value.first.return_value = SiteNSFW( # noqa + 'pornhub.com' + ) + + nsfw_label = helpers.extract_nsfw_label(self.issue_body14) + self.assertEqual(nsfw_label, 'nsfw') + + @patch('webcompat.db.site_nsfw_db.query') + def test_nsfw_label_empty(self, nsfw_mock): + """Site is not found in nsfw db""" + nsfw_mock.return_value.filter_by.return_value.first.return_value = None + + nsfw_label = helpers.extract_nsfw_label(self.issue_body13) + self.assertEqual(nsfw_label, None) + def test_is_github_hook_missing_x_github_event(self): """Validation tests for GitHub Webhooks: Missing X-GitHub-Event.""" json_event, signature = event_data('new_event_invalid.json') 
diff --git a/webcompat/db/__init__.py b/webcompat/db/__init__.py index c32b9e000..893294807 100644 --- a/webcompat/db/__init__.py +++ b/webcompat/db/__init__.py @@ -40,6 +40,12 @@ autoflush=False, bind=regional_site_engine)) +site_nsfw_engine = create_engine('sqlite:///' + os.path.join( + app.config['DATA_PATH'], 'site-nsfw.db')) +site_nsfw_db = scoped_session(sessionmaker(autocommit=False, + autoflush=False, + bind=site_nsfw_engine)) + UsersBase = declarative_base() UsersBase.query = session_db.query_property() @@ -111,3 +117,22 @@ def __init__(self, url, priority, country_code, ranking): RegionalSiteBase.metadata.create_all(bind=regional_site_engine) + + +SiteNSFWBase = declarative_base() +SiteNSFWBase.query = site_nsfw_db.query_property() + + +class SiteNSFW(SiteNSFWBase): + """Define the DB for NSFW domains.""" + + __tablename__ = 'site-nsfw' + + url = Column(String, primary_key=True) + + def __init__(self, url): + """Initialize parameters of the NSFW domains DB.""" + self.url = url + + +SiteNSFWBase.metadata.create_all(bind=site_nsfw_engine) diff --git a/webcompat/form.py b/webcompat/form.py index 962ee144b..10d3d103a 100644 --- a/webcompat/form.py +++ b/webcompat/form.py @@ -33,6 +33,8 @@ from webcompat.helpers import get_os from webcompat.helpers import get_details_list from webcompat.helpers import is_json_object +from webcompat.nsfw import get_nsfw_label +from webcompat.nsfw import moderate_screenshot SCHEMES = ('http://', 'https://') BAD_SCHEMES = ('http:/', 'https:/', 'http:', 'https:') @@ -537,11 +539,11 @@ def build_formdata(form_object): details = form_object.get('details') if details: body += build_details(details) + + if get_nsfw_label(domain): + body = moderate_screenshot(body) + body += get_console_logs_url(form_object.get('console_logs_url')) - # Add the image, if there was one. 
- if form_object.get('image_upload') is not None: - body += '\n\n![Screenshot of the site issue]({image_url})'.format( - image_url=form_object.get('image_upload').get('url')) # Append "from webcompat.com" message to bottom (for GitHub issue viewers) body += '\n\n{0}'.format(GITHUB_HELP) rv = {'title': summary, 'body': body} diff --git a/webcompat/helpers.py b/webcompat/helpers.py index 37636b421..ffa40911b 100644 --- a/webcompat/helpers.py +++ b/webcompat/helpers.py @@ -785,6 +785,15 @@ def get_filename_from_url(uri): return script_path +def get_domains(hostname): + """Extract subdomains""" + subparts = hostname.split('.') + domains = ['.'.join(subparts[i:]) + for i, subpart in enumerate(subparts) + if 0 < i < hostname.count('.')] + return domains + + @app.context_processor def register_get_filename_from_url(): return dict(get_filename_from_url=get_filename_from_url) diff --git a/webcompat/nsfw.py b/webcompat/nsfw.py new file mode 100644 index 000000000..53111e80b --- /dev/null +++ b/webcompat/nsfw.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+"""Helper methods for nsfw detection.""" + +import re + +from webcompat.db import SiteNSFW +from webcompat.db import site_nsfw_db +from webcompat.helpers import get_domains + +NSFW_LABEL = 'nsfw' + + +def is_url_nsfw(url): + return site_nsfw_db.query(SiteNSFW).filter_by(url=url).first() + + +def get_nsfw_label(hostname): + """Query the nsfw DB for hostname or domain.""" + if hostname: + if is_url_nsfw(hostname): + return NSFW_LABEL + + # If hostname not found, try less-level domain (>2) + # If hostname is lv4.lv3.example.com, find lv3.example.com/example.com + domains = get_domains(hostname) + for domain in domains: + if is_url_nsfw(domain): + return NSFW_LABEL + + return None + + +def moderate_screenshot(body): + pattern = re.compile(r'<img alt="Screenshot"[^>]*src="([^"]+)"[^>]*>') + clean_body = re.sub( + pattern, + 'Screenshot removed - possible explicit content.', + body + ) + return clean_body diff --git a/webcompat/webhooks/helpers.py b/webcompat/webhooks/helpers.py index 8a9769a7d..686803623 100644 --- a/webcompat/webhooks/helpers.py +++ b/webcompat/webhooks/helpers.py @@ -20,8 +20,10 @@ from webcompat.helpers import extract_url from webcompat.helpers import proxy_request from webcompat.helpers import to_bytes +from webcompat.helpers import get_domains from webcompat.issues import moderation_template from webcompat.form import wrap_metadata +from webcompat.nsfw import get_nsfw_label BROWSERS = ['blackberry', 'brave', 'chrome', 'edge', 'firefox', 'iceweasel', 'ie', 'lynx', 'myie', 'opera', 'puffin', 'qq', 'safari', 'samsung', 'seamonkey', 'uc', 'vivaldi'] # noqa GECKO_BROWSERS = ['browser-android-components', @@ -117,15 +119,6 @@ def priority_label_by_url(url): return None -def get_domains(hostname): - """Extract subdomains""" - subparts = hostname.split('.') - domains = ['.'.join(subparts[i:]) - for i, subpart in enumerate(subparts) - if 0 < i < hostname.count('.')] - return domains - - def extract_priority_label(body): """Parse url from body and query the priority 
labels.""" hostname = domain_name(extract_url(body)) @@ -144,6 +137,12 @@ def extract_priority_label(body): return label +def extract_nsfw_label(body): + """Parse url from body and query the nsfw DB.""" + hostname = domain_name(extract_url(body)) + return get_nsfw_label(hostname) + + def signature_check(key, post_signature, payload): """Check the HTTP POST legitimacy.""" if post_signature.startswith('sha1='): @@ -191,7 +190,8 @@ def get_issue_labels(issue_body): browser_label = None labelslist.extend(extra_labels) priority_label = extract_priority_label(issue_body) - labelslist.extend([browser_label, priority_label]) + nsfw_label = extract_nsfw_label(issue_body) + labelslist.extend([browser_label, priority_label, nsfw_label]) if any(label for label in labelslist if label in GECKO_BROWSERS): labelslist.append('engine-gecko') if any(label for label in labelslist if label in IOS_BROWSERS): diff --git a/webcompat/webhooks/model.py b/webcompat/webhooks/model.py index c0de1263b..3180d6f83 100644 --- a/webcompat/webhooks/model.py +++ b/webcompat/webhooks/model.py @@ -23,6 +23,7 @@ from webcompat.webhooks.helpers import prepare_rejected_issue from webcompat.webhooks.helpers import repo_scope from webcompat.webhooks.helpers import prepare_private_url +from webcompat.webhooks.helpers import extract_nsfw_label from webcompat.webhooks.ml import get_issue_classification from webcompat.issues import moderation_template @@ -298,6 +299,14 @@ def add_bugbug_tracking_label(self, label_name): path = f'repos/{PUBLIC_REPO}/{self.number}/labels' make_request('post', path, payload) + def label_nsfw(self): + nsfw_label = extract_nsfw_label(self.body) + + if nsfw_label: + payload = {'labels': [nsfw_label]} + path = f'repos/{PRIVATE_REPO}/{self.number}/labels' + make_request('post', path, payload) + def process_issue_action(self): """Route the actions and provide different responses. 
@@ -367,6 +376,12 @@ def process_issue_action(self): msg_log(f'comment failed ({e})', self.number) return oops() + try: + self.label_nsfw() + except HTTPError as e: + msg_log(f'labeling as nsfw failed ({e})', self.number) + return oops() + try: self.classify() except (HTTPError, ConnectionError) as e: