Merge pull request #3704 from webcompat/issue/3703/1

Fixes #3703 - Automatically label nsfw issues
webcompat · Jun 15, 2022 · bcac122 · bcac122
2 parents cb41dae + 02c1746
commit bcac122
Show file tree

Hide file tree

Showing 9 changed files with 273 additions and 33 deletions.
diff --git a/tests/unit/test_helpers.py b/tests/unit/test_helpers.py
@@ -38,6 +38,7 @@
 from webcompat.helpers import get_extra_labels
 from webcompat.helpers import get_filename_from_url
 from webcompat.helpers import is_darknet_domain
+from webcompat.helpers import get_domains
 
 
 ACCESS_TOKEN_LINK = '<https://api.github.com/repositories/17839063/issues?per_page=50&page=3&access_token=12345>; rel="next", <https://api.github.com/repositories/17839063/issues?access_token=12345&per_page=50&page=4>; rel="last", <https://api.github.com/repositories/17839063/issues?per_page=50&access_token=12345&page=1>; rel="first", <https://api.github.com/repositories/17839063/issues?per_page=50&page=1&access_token=12345>; rel="prev"'  # noqa
@@ -602,6 +603,20 @@ def test_get_response_headers_response(self):
                                                 mime_type='text/html')
             assert new_headers2.get('content-type') == 'text/html'
 
+    def test_get_domains(self):
+        """Extract list of parent domains from a hostname."""
+        expectations = (
+            ('www.example.com', ['example.com']),
+            ('sub.example.com', ['example.com']),
+            ('part.sub.example.com', ['sub.example.com', 'example.com']),
+            ('test', []),
+        )
+        for hostname, expected in expectations:
+            self.assertEqual(get_domains(hostname), expected)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/unit/test_nsfw.py b/tests/unit/test_nsfw.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""Tests for the NSFW moderation helpers."""
+
+import unittest
+import webcompat
+from webcompat.nsfw import moderate_screenshot
+
+
+class TestNSFW(unittest.TestCase):
+    """Tests for the NSFW screenshot moderation helper."""
+
+    def setUp(self):
+        """Set up the tests."""
+        webcompat.app.config['TESTING'] = True
+
+        # Issue body containing a screenshot <img> tag inside <details>.
+        self.issue_body_with_screenshot = """
+        <!-- @browser: Firefox 94.0 -->
+        <!-- @ua_header: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0 -->
+        <!-- @reported_with: desktop-reporter -->
+        <!-- @public_url: https://github.com/webcompat/webcompat-tests/issues/2710 -->
+        <!-- @extra_labels: type-webrender-enabled -->
+
+        **URL**: http://aturemlaguerra.org/
+
+        **Browser / Version**: Firefox 94.0
+        **Operating System**: Mac OS X 10.15
+        **Tested Another Browser**: Yes Chrome
+
+        **Problem type**: Site is not usable
+        **Description**: Buttons or links not working
+        **Steps to Reproduce**:
+        gawrhs agrhsrthse sethsrthserhserhser
+        <details>
+              <summary>View the screenshot</summary>
+              <img alt="Screenshot" src="https://staging.webcompat.com/uploads/2021/9/31053f87-a71a-4241-8b47-b2c388545d14.jpeg">
+        </details>
+        """  # noqa
+
+        # Same body with the <img> tag replaced by the moderation notice;
+        # this is the expected output of moderate_screenshot.
+        self.issue_body_moderated = """
+        <!-- @browser: Firefox 94.0 -->
+        <!-- @ua_header: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0 -->
+        <!-- @reported_with: desktop-reporter -->
+        <!-- @public_url: https://github.com/webcompat/webcompat-tests/issues/2710 -->
+        <!-- @extra_labels: type-webrender-enabled -->
+
+        **URL**: http://aturemlaguerra.org/
+
+        **Browser / Version**: Firefox 94.0
+        **Operating System**: Mac OS X 10.15
+        **Tested Another Browser**: Yes Chrome
+
+        **Problem type**: Site is not usable
+        **Description**: Buttons or links not working
+        **Steps to Reproduce**:
+        gawrhs agrhsrthse sethsrthserhserhser
+        <details>
+              <summary>View the screenshot</summary>
+              Screenshot removed - possible explicit content.
+        </details>
+        """  # noqa
+
+        # Body without any <details>/<img> section.
+        self.issue_body_no_screenshot = """
+        <!-- @browser: Firefox 94.0 -->
+        <!-- @ua_header: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0 -->
+        <!-- @reported_with: desktop-reporter -->
+        <!-- @public_url: https://github.com/webcompat/webcompat-tests/issues/2710 -->
+        <!-- @extra_labels: type-webrender-enabled -->
+
+        **URL**: http://aturemlaguerra.org/
+
+        **Browser / Version**: Firefox 94.0
+        **Operating System**: Mac OS X 10.15
+        **Tested Another Browser**: Yes Chrome
+
+        **Problem type**: Site is not usable
+        **Description**: Buttons or links not working
+        **Steps to Reproduce**:
+        gawrhs agrhsrthse sethsrthserhserhser
+        """  # noqa
+
+    def tearDown(self):
+        """Tear down the tests."""
+        # Nothing to clean up.
+        pass
+
+    def test_moderate_screenshot(self):
+        """Moderate screenshot and remove image tag."""
+        body = moderate_screenshot(self.issue_body_with_screenshot)
+        self.assertEqual(body, self.issue_body_moderated)
+
+    def test_no_screenshot_unchanged(self):
+        """Body remains unchanged if there is no screenshot."""
+        body = moderate_screenshot(self.issue_body_no_screenshot)
+        self.assertEqual(body, self.issue_body_no_screenshot)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/unit/test_webhook.py b/tests/unit/test_webhook.py
@@ -17,7 +17,7 @@
 
 import webcompat
 
-from webcompat.db import SiteGlobal, SiteRegional
+from webcompat.db import SiteGlobal, SiteRegional, SiteNSFW
 from webcompat.helpers import to_bytes
 from webcompat.webhooks import helpers, ml
 
@@ -160,6 +160,21 @@ def setUp(self):
         <!-- @browser: Firefox Mobile (Tablet) 40.0 -->
         """
 
+        self.issue_body13 = """
+        <!-- @browser: Firefox 101.0 -->
+        <!-- @ua_header: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:101.0) Gecko/20100101 Firefox/101.0 -->
+        <!-- @reported_with: desktop-reporter -->
+        <!-- @public_url: https://github.com/webcompat/webcompat-tests/issues/2780 -->
+        <!-- @extra_labels: type-webrender-enabled -->
+
+        **URL**: http://pornhub.com/
+        """  # noqa
+
+        self.issue_body14 = """
+        **URL**: http://test.pornhub.com/
+        <!-- @browser: Firefox Mobile (Tablet) 40.0 -->
+        """
+
         self.issue_info1 = {
             'action': 'foobar',
             'state': 'open',
@@ -402,29 +417,14 @@ def test_priority_label_not_found(self, mock_func):
         assert mock_func.call_count == 2
         self.assertEqual(priority_label, None)
 
-    def test_get_domains(self):
-        """Extract list of subdomains."""
-        self.assertEqual(
-            helpers.get_domains('www.example.com'), ['example.com']
-        )
-        self.assertEqual(
-            helpers.get_domains('sub.example.com'), ['example.com']
-        )
-        self.assertEqual(
-            helpers.get_domains('part.sub.example.com'),
-            ['sub.example.com', 'example.com']
-        )
-        self.assertEqual(
-            helpers.get_domains('test'),
-            []
-        )
-
+    @patch('webcompat.db.site_nsfw_db.query')
     @patch('webcompat.db.regional_site_db.query')
     @patch('webcompat.db.global_site_db.query')
-    def test_get_issue_labels(self, gm, rm):
+    def test_get_issue_labels(self, gm, rm, nm):
         """Extract list of labels from an issue body."""
         rm.return_value.filter_by.return_value.first.return_value = None
         gm.return_value.filter_by.return_value.first.return_value = None
+        nm.return_value.filter_by.return_value.first.return_value = None
 
         labels_tests = [
             (self.issue_body, ['browser-firefox', 'type-media',
@@ -445,6 +445,34 @@ def test_get_issue_labels(self, gm, rm):
             actual = helpers.get_issue_labels(issue_body)
             self.assertEqual(sorted(expected), sorted(actual))
 
+    @patch('webcompat.db.site_nsfw_db.query')
+    def test_nsfw_label(self, nsfw_mock):
+        """Extract nsfw label."""
+        # Make the mocked query chain report a matching row.
+        first_result = nsfw_mock.return_value.filter_by.return_value.first
+        first_result.return_value = SiteNSFW('pornhub.com')
+
+        self.assertEqual(
+            helpers.extract_nsfw_label(self.issue_body13), 'nsfw'
+        )
+
+    @patch('webcompat.db.site_nsfw_db.query')
+    def test_nsfw_label_subdomain(self, nsfw_mock):
+        """Extract nsfw for a subdomain."""
+        # Make the mocked query chain report a matching row.
+        first_result = nsfw_mock.return_value.filter_by.return_value.first
+        first_result.return_value = SiteNSFW('pornhub.com')
+
+        self.assertEqual(
+            helpers.extract_nsfw_label(self.issue_body14), 'nsfw'
+        )
+
+    @patch('webcompat.db.site_nsfw_db.query')
+    def test_nsfw_label_empty(self, nsfw_mock):
+        """Site is not found in nsfw db."""
+        # Make the mocked query chain report that no row matches.
+        first_result = nsfw_mock.return_value.filter_by.return_value.first
+        first_result.return_value = None
+
+        self.assertEqual(
+            helpers.extract_nsfw_label(self.issue_body13), None
+        )
+
     def test_is_github_hook_missing_x_github_event(self):
         """Validation tests for GitHub Webhooks: Missing X-GitHub-Event."""
         json_event, signature = event_data('new_event_invalid.json')

diff --git a/webcompat/db/__init__.py b/webcompat/db/__init__.py
@@ -40,6 +40,12 @@
                                   autoflush=False,
                                   bind=regional_site_engine))
 
+# Engine and scoped session for the NSFW domains SQLite DB
+# (site-nsfw.db under DATA_PATH).
+site_nsfw_engine = create_engine('sqlite:///' + os.path.join(
+    app.config['DATA_PATH'], 'site-nsfw.db'))
+site_nsfw_db = scoped_session(sessionmaker(autocommit=False,
+                                           autoflush=False,
+                                           bind=site_nsfw_engine))
+
 
 UsersBase = declarative_base()
 UsersBase.query = session_db.query_property()
@@ -111,3 +117,22 @@ def __init__(self, url, priority, country_code, ranking):
 
 
 RegionalSiteBase.metadata.create_all(bind=regional_site_engine)
+
+
+# Declarative base for the NSFW DB; its query property is bound to the
+# site_nsfw_db scoped session.
+SiteNSFWBase = declarative_base()
+SiteNSFWBase.query = site_nsfw_db.query_property()
+
+
+class SiteNSFW(SiteNSFWBase):
+    """Define the DB for NSFW domains.
+
+    The 'site-nsfw' table has a single string column, ``url``, which is
+    also its primary key.
+    """
+
+    __tablename__ = 'site-nsfw'
+
+    url = Column(String, primary_key=True)
+
+    def __init__(self, url):
+        """Initialize parameters of the NSFW domains DB."""
+        self.url = url
+
+
+# Create the tables declared on SiteNSFWBase using the NSFW engine.
+SiteNSFWBase.metadata.create_all(bind=site_nsfw_engine)
diff --git a/webcompat/form.py b/webcompat/form.py
@@ -33,6 +33,8 @@
 from webcompat.helpers import get_os
 from webcompat.helpers import get_details_list
 from webcompat.helpers import is_json_object
+from webcompat.nsfw import get_nsfw_label
+from webcompat.nsfw import moderate_screenshot
 
 SCHEMES = ('http://', 'https://')
 BAD_SCHEMES = ('http:/', 'https:/', 'http:', 'https:')
@@ -537,11 +539,11 @@ def build_formdata(form_object):
     details = form_object.get('details')
     if details:
         body += build_details(details)
+
+    if get_nsfw_label(domain):
+        body = moderate_screenshot(body)
+
     body += get_console_logs_url(form_object.get('console_logs_url'))
-    # Add the image, if there was one.
-    if form_object.get('image_upload') is not None:
-        body += '\n\n![Screenshot of the site issue]({image_url})'.format(
-            image_url=form_object.get('image_upload').get('url'))
     # Append "from webcompat.com" message to bottom (for GitHub issue viewers)
     body += '\n\n{0}'.format(GITHUB_HELP)
     rv = {'title': summary, 'body': body}

diff --git a/webcompat/helpers.py b/webcompat/helpers.py
@@ -785,6 +785,15 @@ def get_filename_from_url(uri):
     return script_path
 
 
+def get_domains(hostname):
+    """Return the parent domains of a hostname, longest first.
+
+    The hostname itself and the bare last label are left out, e.g.
+    'part.sub.example.com' gives ['sub.example.com', 'example.com'] and
+    'www.example.com' gives ['example.com']. A hostname with fewer than
+    three dot-separated labels gives an empty list.
+    """
+    labels = hostname.split('.')
+    # Drop 1..n-2 leading labels: dropping none would repeat the hostname
+    # and dropping n-1 would leave only the last label.
+    return ['.'.join(labels[start:]) for start in range(1, len(labels) - 1)]
+
+
 @app.context_processor
 def register_get_filename_from_url():
     return dict(get_filename_from_url=get_filename_from_url)
diff --git a/webcompat/nsfw.py b/webcompat/nsfw.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+"""Helper methods for nsfw detection."""
+
+import re
+
+from webcompat.db import SiteNSFW
+from webcompat.db import site_nsfw_db
+from webcompat.helpers import get_domains
+
+NSFW_LABEL = 'nsfw'
+
+
+def is_url_nsfw(url):
+    """Look up url in the NSFW DB.
+
+    Returns the result of ``first()`` on the SiteNSFW query filtered by
+    ``url``; callers in this module only use its truthiness.
+    """
+    matches = site_nsfw_db.query(SiteNSFW).filter_by(url=url)
+    return matches.first()
+
+
+def get_nsfw_label(hostname):
+    """Return the nsfw label if hostname or a parent domain is in the DB.
+
+    Returns None for an empty hostname or when nothing matches.
+    """
+    if not hostname:
+        return None
+
+    # Exact hostname first.
+    if is_url_nsfw(hostname):
+        return NSFW_LABEL
+
+    # Then the parent domains: lv4.lv3.example.com is also checked as
+    # lv3.example.com and example.com.
+    if any(is_url_nsfw(parent) for parent in get_domains(hostname)):
+        return NSFW_LABEL
+
+    return None
+
+
+def moderate_screenshot(body):
+    """Replace the screenshot image tag in an issue body with a notice.
+
+    Every ``<img alt="Screenshot" ... src="...">`` tag is swapped for a
+    plain-text placeholder; a body without such a tag comes back unchanged.
+    """
+    notice = 'Screenshot removed - possible explicit content.'
+    screenshot_tag = re.compile(
+        r'<img alt="Screenshot" [^>]*src="([^"]+)"[^>]*>'
+    )
+    return screenshot_tag.sub(notice, body)