Skip to content

Commit

Permalink
Merge pull request #3704 from webcompat/issue/3703/1
Browse files Browse the repository at this point in the history
Fixes #3703 - Automatically label nsfw issues
  • Loading branch information
ksy36 authored Jun 15, 2022
2 parents cb41dae + 02c1746 commit bcac122
Show file tree
Hide file tree
Showing 9 changed files with 273 additions and 33 deletions.
15 changes: 15 additions & 0 deletions tests/unit/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from webcompat.helpers import get_extra_labels
from webcompat.helpers import get_filename_from_url
from webcompat.helpers import is_darknet_domain
from webcompat.helpers import get_domains


ACCESS_TOKEN_LINK = '<https://api.github.com/repositories/17839063/issues?per_page=50&page=3&access_token=12345>; rel="next", <https://api.github.com/repositories/17839063/issues?access_token=12345&per_page=50&page=4>; rel="last", <https://api.github.com/repositories/17839063/issues?per_page=50&access_token=12345&page=1>; rel="first", <https://api.github.com/repositories/17839063/issues?per_page=50&page=1&access_token=12345>; rel="prev"' # noqa
Expand Down Expand Up @@ -602,6 +603,20 @@ def test_get_response_headers_response(self):
mime_type='text/html')
assert new_headers2.get('content-type') == 'text/html'

def test_get_domains(self):
    """Check the parent domains derived from a hostname."""
    expectations = (
        ('www.example.com', ['example.com']),
        ('sub.example.com', ['example.com']),
        ('part.sub.example.com', ['sub.example.com', 'example.com']),
        # A single label has no parent domain at all.
        ('test', []),
    )
    for hostname, parents in expectations:
        self.assertEqual(get_domains(hostname), parents)


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
102 changes: 102 additions & 0 deletions tests/unit/test_nsfw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Tests for Siterank class."""

import unittest
import webcompat
from webcompat.nsfw import moderate_screenshot


class TestNSFW(unittest.TestCase):
    """Tests for the NSFW screenshot moderation helper."""

    def setUp(self):
        """Set up the tests."""
        webcompat.app.config['TESTING'] = True

        # Issue body whose <details> section embeds the screenshot <img> tag.
        self.issue_body_with_screenshot = """
<!-- @browser: Firefox 94.0 -->
<!-- @ua_header: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0 -->
<!-- @reported_with: desktop-reporter -->
<!-- @public_url: https://github.com/webcompat/webcompat-tests/issues/2710 -->
<!-- @extra_labels: type-webrender-enabled -->
**URL**: http://aturemlaguerra.org/
**Browser / Version**: Firefox 94.0
**Operating System**: Mac OS X 10.15
**Tested Another Browser**: Yes Chrome
**Problem type**: Site is not usable
**Description**: Buttons or links not working
**Steps to Reproduce**:
gawrhs agrhsrthse sethsrthserhserhser
<details>
<summary>View the screenshot</summary>
<img alt="Screenshot" src="https://staging.webcompat.com/uploads/2021/9/31053f87-a71a-4241-8b47-b2c388545d14.jpeg">
</details>
""" # noqa

        # Expected output: the same body with the <img> tag swapped for the
        # "Screenshot removed" notice.
        self.issue_body_moderated = """
<!-- @browser: Firefox 94.0 -->
<!-- @ua_header: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0 -->
<!-- @reported_with: desktop-reporter -->
<!-- @public_url: https://github.com/webcompat/webcompat-tests/issues/2710 -->
<!-- @extra_labels: type-webrender-enabled -->
**URL**: http://aturemlaguerra.org/
**Browser / Version**: Firefox 94.0
**Operating System**: Mac OS X 10.15
**Tested Another Browser**: Yes Chrome
**Problem type**: Site is not usable
**Description**: Buttons or links not working
**Steps to Reproduce**:
gawrhs agrhsrthse sethsrthserhserhser
<details>
<summary>View the screenshot</summary>
Screenshot removed - possible explicit content.
</details>
""" # noqa

        # Issue body with no screenshot markup at all.
        self.issue_body_no_screenshot = """
<!-- @browser: Firefox 94.0 -->
<!-- @ua_header: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0 -->
<!-- @reported_with: desktop-reporter -->
<!-- @public_url: https://github.com/webcompat/webcompat-tests/issues/2710 -->
<!-- @extra_labels: type-webrender-enabled -->
**URL**: http://aturemlaguerra.org/
**Browser / Version**: Firefox 94.0
**Operating System**: Mac OS X 10.15
**Tested Another Browser**: Yes Chrome
**Problem type**: Site is not usable
**Description**: Buttons or links not working
**Steps to Reproduce**:
gawrhs agrhsrthse sethsrthserhserhser
""" # noqa

    def tearDown(self):
        """Tear down the tests."""
        pass

    def test_moderate_screenshot(self):
        """Moderate screenshot and remove image tag."""
        body = moderate_screenshot(self.issue_body_with_screenshot)
        self.assertEqual(body, self.issue_body_moderated)

    def test_no_screenshot_unchanged(self):
        """Body remains unchanged if there is no screenshot."""
        body = moderate_screenshot(self.issue_body_no_screenshot)
        self.assertEqual(body, self.issue_body_no_screenshot)


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
66 changes: 47 additions & 19 deletions tests/unit/test_webhook.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import webcompat

from webcompat.db import SiteGlobal, SiteRegional
from webcompat.db import SiteGlobal, SiteRegional, SiteNSFW
from webcompat.helpers import to_bytes
from webcompat.webhooks import helpers, ml

Expand Down Expand Up @@ -160,6 +160,21 @@ def setUp(self):
<!-- @browser: Firefox Mobile (Tablet) 40.0 -->
"""

self.issue_body13 = """
<!-- @browser: Firefox 101.0 -->
<!-- @ua_header: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:101.0) Gecko/20100101 Firefox/101.0 -->
<!-- @reported_with: desktop-reporter -->
<!-- @public_url: https://github.com/webcompat/webcompat-tests/issues/2780 -->
<!-- @extra_labels: type-webrender-enabled -->
**URL**: http://pornhub.com/
""" # noqa

self.issue_body14 = """
**URL**: http://test.pornhub.com/
<!-- @browser: Firefox Mobile (Tablet) 40.0 -->
"""

self.issue_info1 = {
'action': 'foobar',
'state': 'open',
Expand Down Expand Up @@ -402,29 +417,14 @@ def test_priority_label_not_found(self, mock_func):
assert mock_func.call_count == 2
self.assertEqual(priority_label, None)

def test_get_domains(self):
    """Check the parent domains derived from a hostname."""
    expectations = (
        ('www.example.com', ['example.com']),
        ('sub.example.com', ['example.com']),
        ('part.sub.example.com', ['sub.example.com', 'example.com']),
        # A single label has no parent domain at all.
        ('test', []),
    )
    for hostname, parents in expectations:
        self.assertEqual(helpers.get_domains(hostname), parents)

@patch('webcompat.db.site_nsfw_db.query')
@patch('webcompat.db.regional_site_db.query')
@patch('webcompat.db.global_site_db.query')
def test_get_issue_labels(self, gm, rm):
def test_get_issue_labels(self, gm, rm, nm):
"""Extract list of labels from an issue body."""
rm.return_value.filter_by.return_value.first.return_value = None
gm.return_value.filter_by.return_value.first.return_value = None
nm.return_value.filter_by.return_value.first.return_value = None

labels_tests = [
(self.issue_body, ['browser-firefox', 'type-media',
Expand All @@ -445,6 +445,34 @@ def test_get_issue_labels(self, gm, rm):
actual = helpers.get_issue_labels(issue_body)
self.assertEqual(sorted(expected), sorted(actual))

@patch('webcompat.db.site_nsfw_db.query')
def test_nsfw_label(self, nsfw_mock):
    """Return the nsfw label when the site is in the nsfw db."""
    db_lookup = nsfw_mock.return_value.filter_by.return_value.first
    db_lookup.return_value = SiteNSFW('pornhub.com')

    self.assertEqual(
        helpers.extract_nsfw_label(self.issue_body13), 'nsfw'
    )

@patch('webcompat.db.site_nsfw_db.query')
def test_nsfw_label_subdomain(self, nsfw_mock):
    """Extract nsfw for a subdomain whose parent domain is listed.

    Only ``pornhub.com`` is reported as present in the nsfw db, so the
    lookup for the full hostname (``test.pornhub.com``) misses and the
    label can only come from the parent-domain fallback.
    """
    from unittest.mock import MagicMock

    def lookup(*args, **criteria):
        # Answer per queried url instead of returning a hit for every
        # query; a blanket hit would make the exact-hostname lookup
        # succeed and leave the subdomain path untested.
        found = criteria.get('url') == 'pornhub.com'
        result = MagicMock()
        result.first.return_value = (
            SiteNSFW('pornhub.com') if found else None
        )
        return result

    nsfw_mock.return_value.filter_by.side_effect = lookup

    nsfw_label = helpers.extract_nsfw_label(self.issue_body14)
    self.assertEqual(nsfw_label, 'nsfw')

@patch('webcompat.db.site_nsfw_db.query')
def test_nsfw_label_empty(self, nsfw_mock):
    """Return no label when the site is missing from the nsfw db."""
    db_lookup = nsfw_mock.return_value.filter_by.return_value.first
    db_lookup.return_value = None

    label = helpers.extract_nsfw_label(self.issue_body13)
    self.assertEqual(label, None)

def test_is_github_hook_missing_x_github_event(self):
"""Validation tests for GitHub Webhooks: Missing X-GitHub-Event."""
json_event, signature = event_data('new_event_invalid.json')
Expand Down
25 changes: 25 additions & 0 deletions webcompat/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@
autoflush=False,
bind=regional_site_engine))

# SQLite engine and scoped session for the NSFW sites DB, stored as
# site-nsfw.db under the configured DATA_PATH.
site_nsfw_engine = create_engine('sqlite:///' + os.path.join(
    app.config['DATA_PATH'], 'site-nsfw.db'))
site_nsfw_db = scoped_session(sessionmaker(autocommit=False,
                                           autoflush=False,
                                           bind=site_nsfw_engine))


UsersBase = declarative_base()
UsersBase.query = session_db.query_property()
Expand Down Expand Up @@ -111,3 +117,22 @@ def __init__(self, url, priority, country_code, ranking):


RegionalSiteBase.metadata.create_all(bind=regional_site_engine)


# Declarative base for the NSFW sites DB; its ``query`` attribute is bound
# to the site_nsfw_db scoped session.
SiteNSFWBase = declarative_base()
SiteNSFWBase.query = site_nsfw_db.query_property()


class SiteNSFW(SiteNSFWBase):
    """Define the DB for NSFW domains."""

    __tablename__ = 'site-nsfw'

    # Primary key and the only column of the table.
    url = Column(String, primary_key=True)

    def __init__(self, url):
        """Initialize parameters of the NSFW domains DB."""
        self.url = url


# Create the tables declared on SiteNSFWBase using the NSFW engine.
SiteNSFWBase.metadata.create_all(bind=site_nsfw_engine)
10 changes: 6 additions & 4 deletions webcompat/form.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
from webcompat.helpers import get_os
from webcompat.helpers import get_details_list
from webcompat.helpers import is_json_object
from webcompat.nsfw import get_nsfw_label
from webcompat.nsfw import moderate_screenshot

SCHEMES = ('http://', 'https://')
BAD_SCHEMES = ('http:/', 'https:/', 'http:', 'https:')
Expand Down Expand Up @@ -537,11 +539,11 @@ def build_formdata(form_object):
details = form_object.get('details')
if details:
body += build_details(details)

if get_nsfw_label(domain):
body = moderate_screenshot(body)

body += get_console_logs_url(form_object.get('console_logs_url'))
# Add the image, if there was one.
if form_object.get('image_upload') is not None:
body += '\n\n![Screenshot of the site issue]({image_url})'.format(
image_url=form_object.get('image_upload').get('url'))
# Append "from webcompat.com" message to bottom (for GitHub issue viewers)
body += '\n\n{0}'.format(GITHUB_HELP)
rv = {'title': summary, 'body': body}
Expand Down
9 changes: 9 additions & 0 deletions webcompat/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,15 @@ def get_filename_from_url(uri):
return script_path


def get_domains(hostname):
    """Return the parent domains of a hostname, most specific first.

    Neither the hostname itself nor the bare last label is included, so
    ``part.sub.example.com`` gives ``['sub.example.com', 'example.com']``
    while ``example.com`` and ``test`` both give an empty list.
    """
    labels = hostname.split('.')
    # Start at 1 to drop the hostname itself; stop before the last label
    # so a lone top-level label such as ``com`` is never produced.
    return ['.'.join(labels[start:]) for start in range(1, len(labels) - 1)]


@app.context_processor
def register_get_filename_from_url():
    """Register get_filename_from_url through app.context_processor."""
    return {'get_filename_from_url': get_filename_from_url}
44 changes: 44 additions & 0 deletions webcompat/nsfw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Helpers methods for nsfw detection."""

import re

from webcompat.db import SiteNSFW
from webcompat.db import site_nsfw_db
from webcompat.helpers import get_domains

# Label name returned when a hostname is found in the NSFW sites DB.
NSFW_LABEL = 'nsfw'


def is_url_nsfw(url):
    """Look up ``url`` in the NSFW sites DB.

    Returns whatever ``first()`` yields for the query filtered on
    ``url``; callers in this module only test it for truthiness.
    """
    matches = site_nsfw_db.query(SiteNSFW).filter_by(url=url)
    return matches.first()


def get_nsfw_label(hostname):
    """Return the NSFW label for a hostname listed in the NSFW DB.

    The exact hostname is checked first, then each of its parent
    domains: for lv4.lv3.example.com that means lv3.example.com and
    example.com. ``None`` is returned for an empty hostname or when
    nothing matches.
    """
    if not hostname:
        return None

    if is_url_nsfw(hostname):
        return NSFW_LABEL

    # Only fall back to the parent domains once the exact host missed.
    if any(is_url_nsfw(parent) for parent in get_domains(hostname)):
        return NSFW_LABEL
    return None


# Matches the screenshot tag embedded in an issue body, e.g.
# <img alt="Screenshot" src="https://.../uploads/....jpeg">
# Compiled once at import time rather than on every call.
_SCREENSHOT_IMG_RE = re.compile(
    r'<img alt="Screenshot" [^>]*src="[^"]+"[^>]*>'
)
_SCREENSHOT_REMOVED = 'Screenshot removed - possible explicit content.'


def moderate_screenshot(body):
    """Replace every screenshot ``<img>`` tag in ``body`` with a notice.

    Only tags that start with ``<img alt="Screenshot" `` and carry a
    ``src`` attribute are replaced; any other text, including other
    images, is returned untouched.
    """
    return _SCREENSHOT_IMG_RE.sub(_SCREENSHOT_REMOVED, body)
Loading

0 comments on commit bcac122

Please sign in to comment.