From f0aeeb6bf8cc6dc517e67d33286afaeebfda217e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Ma=C4=87kowski?= Date: Fri, 13 May 2022 14:50:17 +0200 Subject: [PATCH] chore: fix pre-commit failures Pre-commit hooks used to fail on the project's master branch when running on all files; this commit fixes that. --- backend/fregepoc/celery_app.py | 7 ++- .../0003_alter_githubindexer_options.py | 9 ++-- .../migrations/0004_add_sourceforgeindexer.py | 21 ++++---- backend/fregepoc/indexers/models.py | 50 ++++++++++++------- .../fregepoc/indexers/sourceforge/__init__.py | 4 +- .../single_page_projects_extractor.py | 11 ++-- .../single_project_code_url_exctractor.py | 6 +-- .../single_project_git_link_extractor.py | 11 ++-- .../single_project_git_url_extractor.py | 26 +++++----- .../single_project_response_extractor.py | 4 +- ...2_alter_repository_description_and_more.py | 25 +++++++--- 11 files changed, 105 insertions(+), 69 deletions(-) diff --git a/backend/fregepoc/celery_app.py b/backend/fregepoc/celery_app.py index 04574f5..1eb3844 100644 --- a/backend/fregepoc/celery_app.py +++ b/backend/fregepoc/celery_app.py @@ -5,7 +5,12 @@ # set the default Django settings module for the 'celery' program. os.environ.setdefault("DJANGO_SETTINGS_MODULE", "fregepoc.settings") -app = Celery("frege", worker_max_tasks_per_child=os.getenv("CELERY_WORKER_MAX_TASKS_PER_CHILD", default=4)) +app = Celery( + "frege", + worker_max_tasks_per_child=os.getenv( + "CELERY_WORKER_MAX_TASKS_PER_CHILD", default=4 + ), +) # Using a string here means the worker doesn't have to serialize # the configuration object to child processes. diff --git a/backend/fregepoc/indexers/migrations/0003_alter_githubindexer_options.py b/backend/fregepoc/indexers/migrations/0003_alter_githubindexer_options.py index c2e17bc..b3ce25e 100644 --- a/backend/fregepoc/indexers/migrations/0003_alter_githubindexer_options.py +++ b/backend/fregepoc/indexers/migrations/0003_alter_githubindexer_options.py @@ -6,12 +6,15 @@ class Migration(migrations.Migration): dependencies = [ - ('indexers', '0002_alter_githubindexer_current_page'), + ("indexers", "0002_alter_githubindexer_current_page"), ] operations = [ migrations.AlterModelOptions( - name='githubindexer', - options={'verbose_name': 'Github Indexer', 'verbose_name_plural': 'Github Indexer'}, + name="githubindexer", + options={ + "verbose_name": "Github Indexer", + "verbose_name_plural": "Github Indexer", + }, ), ] diff --git a/backend/fregepoc/indexers/migrations/0004_add_sourceforgeindexer.py b/backend/fregepoc/indexers/migrations/0004_add_sourceforgeindexer.py index 1c67dd1..3442181 100644 --- a/backend/fregepoc/indexers/migrations/0004_add_sourceforgeindexer.py +++ b/backend/fregepoc/indexers/migrations/0004_add_sourceforgeindexer.py @@ -1,39 +1,40 @@ # Generated by Django 4.0.3 on 2022-04-08 19:35 from django.db import migrations, models + import fregepoc.indexers.models class Migration(migrations.Migration): dependencies = [ - ('indexers', '0003_alter_githubindexer_options'), + ("indexers", "0003_alter_githubindexer_options"), ] operations = [ migrations.CreateModel( - name='SourceforgeIndexer', + name="SourceforgeIndexer", fields=[ ( - 'id', + "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, - verbose_name='ID' - ) + verbose_name="ID", + ), ), ( - 'current_page', + "current_page", models.PositiveIntegerField( default=1, - help_text='The last visited page.', - verbose_name='current page' - ) + help_text="The last visited page.", + verbose_name="current page", + ), ), ], options={ - 'abstract': False, + "abstract": False, }, bases=(fregepoc.indexers.models.BaseIndexer,), ), diff --git a/backend/fregepoc/indexers/models.py b/backend/fregepoc/indexers/models.py index b971362..774f5a3 100644 --- a/backend/fregepoc/indexers/models.py +++ b/backend/fregepoc/indexers/models.py @@ -7,14 +7,15 @@ from github import Github from fregepoc.indexers.base import BaseIndexer +from fregepoc.indexers.sourceforge import ( + SinglePageProjectsExtractor, + SingleProjectCodeUrlExtractor, + SingleProjectGitLinkExtractor, + SingleProjectGitUrlExtractor, + SingleProjectResponseExtractor, +) from fregepoc.repositories.models import Repository -from fregepoc.indexers.sourceforge import SinglePageProjectsExtractor -from fregepoc.indexers.sourceforge import SingleProjectCodeUrlExtractor -from fregepoc.indexers.sourceforge import SingleProjectGitLinkExtractor -from fregepoc.indexers.sourceforge import SingleProjectGitUrlExtractor -from fregepoc.indexers.sourceforge import SingleProjectResponseExtractor - class GitHubIndexer(BaseIndexer): min_forks = models.PositiveIntegerField( @@ -94,8 +95,9 @@ def main_loop(self): self.current_page += 1 if self.current_page >= 1000: - # The maximum page on the SourceForge is 999. When we reach the limit, we just start over. - # In the future, we may want to change by using some filters (categories) to scrap more data. + # The maximum page on the SourceForge is 999. When we reach + # the limit, we just start over. In the future, we may want + # to change by using some filters (categories) to scrap more data. self.current_page = 0 self.save(update_fields=["current_page"]) @@ -105,15 +107,25 @@ def main_loop(self): for project_name in projects: repos_to_process = [] - single_project_soup = self.singleProjectResponseExtractor.extract(project_name) + single_project_soup = self.singleProjectResponseExtractor.extract( + project_name + ) - project_code_url = self.singleProjectCodeUrlExtractor.extract(single_project_soup) - repo_from_code_url = self.handle_code_url(project_name, project_code_url) + project_code_url = self.singleProjectCodeUrlExtractor.extract( + single_project_soup + ) + repo_from_code_url = self.handle_code_url( + project_name, project_code_url + ) if repo_from_code_url is not None: repos_to_process.append(repo_from_code_url) - project_git_ulr = self.singleProjectGitUrlExtractor.extract(single_project_soup) - repos_to_process.extend(self.handle_git_url(project_name, project_git_ulr)) + project_git_ulr = self.singleProjectGitUrlExtractor.extract( + single_project_soup + ) + repos_to_process.extend( + self.handle_git_url(project_name, project_git_ulr) + ) Repository.objects.bulk_create(repos_to_process) return repos_to_process @@ -122,7 +134,9 @@ def handle_code_url(self, project_name, project_code_url): if project_code_url is None: return None - project_git_url_from_code_url = self.singleProjectGitLinkExtractor.extract(project_code_url) + project_git_url_from_code_url = ( + self.singleProjectGitLinkExtractor.extract(project_code_url) + ) if project_git_url_from_code_url is None: return None @@ -136,10 +150,7 @@ def handle_code_url(self, project_name, project_code_url): def handle_git_url(self, project_name, project_git_urls): extracted_git_url = [ - ( - subproject, - self.singleProjectGitLinkExtractor.extract(git_url) - ) + (subproject, self.singleProjectGitLinkExtractor.extract(git_url)) for (subproject, git_url) in project_git_urls ] @@ -151,7 +162,8 @@ def handle_git_url(self, project_name, project_git_urls): repo_url=git_url, # TODO commit_hash="HEAD", # TODO ) - for (subproject, git_url) in extracted_git_url if git_url is not None + for (subproject, git_url) in extracted_git_url + if git_url is not None ] class Meta: diff --git a/backend/fregepoc/indexers/sourceforge/__init__.py b/backend/fregepoc/indexers/sourceforge/__init__.py index 4905201..4486c61 100644 --- a/backend/fregepoc/indexers/sourceforge/__init__.py +++ b/backend/fregepoc/indexers/sourceforge/__init__.py @@ -1,5 +1,5 @@ from .single_page_projects_extractor import SinglePageProjectsExtractor -from .single_project_response_extractor import SingleProjectResponseExtractor -from .single_project_git_link_extractor import SingleProjectGitLinkExtractor from .single_project_code_url_exctractor import SingleProjectCodeUrlExtractor +from .single_project_git_link_extractor import SingleProjectGitLinkExtractor from .single_project_git_url_extractor import SingleProjectGitUrlExtractor +from .single_project_response_extractor import SingleProjectResponseExtractor diff --git a/backend/fregepoc/indexers/sourceforge/single_page_projects_extractor.py b/backend/fregepoc/indexers/sourceforge/single_page_projects_extractor.py index 0dcf1d5..76a876f 100644 --- a/backend/fregepoc/indexers/sourceforge/single_page_projects_extractor.py +++ b/backend/fregepoc/indexers/sourceforge/single_page_projects_extractor.py @@ -9,16 +9,19 @@ class SinglePageProjectsExtractor: def extract(page_number): if page_number <= 0: return None - url = f'https://sourceforge.net/directory/?sort=popular&page={page_number}' + url = ( + f"https://sourceforge.net/directory/?sort=popular" + f"&page={page_number}" + ) response = requests.get(url) if response.status_code == 404: return None - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") projects_set = set() - for link in soup.find_all('a', href=re.compile(r'/projects/\w+')): - projects_set.add('/'.join(link['href'].split('/')[:3])[1:]) + for link in soup.find_all("a", href=re.compile(r"/projects/\w+")): + projects_set.add("/".join(link["href"].split("/")[:3])[1:]) return projects_set diff --git a/backend/fregepoc/indexers/sourceforge/single_project_code_url_exctractor.py b/backend/fregepoc/indexers/sourceforge/single_project_code_url_exctractor.py index 8d8f984..da262ec 100644 --- a/backend/fregepoc/indexers/sourceforge/single_project_code_url_exctractor.py +++ b/backend/fregepoc/indexers/sourceforge/single_project_code_url_exctractor.py @@ -5,8 +5,8 @@ def extract(soup): return code_urls = set() - for span in soup.find_all('span'): - if span.text == 'Code': - code_urls.add(span.find_parents('a')[0]['href'][1:]) + for span in soup.find_all("span"): + if span.text == "Code": + code_urls.add(span.find_parents("a")[0]["href"][1:]) return code_urls diff --git a/backend/fregepoc/indexers/sourceforge/single_project_git_link_extractor.py b/backend/fregepoc/indexers/sourceforge/single_project_git_link_extractor.py index bf7c9a6..b5a4ff2 100644 --- a/backend/fregepoc/indexers/sourceforge/single_project_git_link_extractor.py +++ b/backend/fregepoc/indexers/sourceforge/single_project_git_link_extractor.py @@ -3,19 +3,18 @@ class SingleProjectGitLinkExtractor: - @staticmethod def extract(code_url): if not code_url: return - url = f'https://sourceforge.net/{code_url}' + url = f"https://sourceforge.net/{code_url}" response = requests.get(url) - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") - value = soup.find('input', {'id': 'access_url'}) + value = soup.find("input", {"id": "access_url"}) if value: - value = value.get('value') - if value.startswith('git clone'): + value = value.get("value") + if value.startswith("git clone"): git_link = value.split()[2] return git_link diff --git a/backend/fregepoc/indexers/sourceforge/single_project_git_url_extractor.py b/backend/fregepoc/indexers/sourceforge/single_project_git_url_extractor.py index a325506..1b41f16 100644 --- a/backend/fregepoc/indexers/sourceforge/single_project_git_url_extractor.py +++ b/backend/fregepoc/indexers/sourceforge/single_project_git_url_extractor.py @@ -10,23 +10,25 @@ def extract(soup): if not soup: return code_urls - for li in soup.find_all('ul', {'class': 'dropdown'})[0]('li'): + for li in soup.find_all("ul", {"class": "dropdown"})[0]("li"): try: - a = li('a')[0] - if a('span')[0].text.startswith('Git'): - href_link = a['href'] - if href_link.startswith('/p'): - url = f'https://sourceforge.net/{href_link[1:]}' + a = li("a")[0] + if a("span")[0].text.startswith("Git"): + href_link = a["href"] + if href_link.startswith("/p"): + url = f"https://sourceforge.net/{href_link[1:]}" response = requests.get(url) - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") - for link in soup.find_all('div', {'class': 'list card'}): - element = link('a')[0] - cleaned_link = element['href'] - if cleaned_link.startswith('/p'): + for link in soup.find_all( + "div", {"class": "list card"} + ): + element = link("a")[0] + cleaned_link = element["href"] + if cleaned_link.startswith("/p"): code_urls.add((element.text, cleaned_link[1:])) - except: + except Exception: pass return code_urls diff --git a/backend/fregepoc/indexers/sourceforge/single_project_response_extractor.py b/backend/fregepoc/indexers/sourceforge/single_project_response_extractor.py index a3097a3..9aae9fa 100644 --- a/backend/fregepoc/indexers/sourceforge/single_project_response_extractor.py +++ b/backend/fregepoc/indexers/sourceforge/single_project_response_extractor.py @@ -8,6 +8,6 @@ def extract(project_name): if not project_name: return - url = f'https://sourceforge.net/{project_name}' + url = f"https://sourceforge.net/{project_name}" response = requests.get(url) - return BeautifulSoup(response.text, 'html.parser') + return BeautifulSoup(response.text, "html.parser") diff --git a/backend/fregepoc/repositories/migrations/0002_alter_repository_description_and_more.py b/backend/fregepoc/repositories/migrations/0002_alter_repository_description_and_more.py index 155a1f4..fce794e 100644 --- a/backend/fregepoc/repositories/migrations/0002_alter_repository_description_and_more.py +++ b/backend/fregepoc/repositories/migrations/0002_alter_repository_description_and_more.py @@ -6,19 +6,30 @@ class Migration(migrations.Migration): dependencies = [ - ('repositories', '0001_initial'), + ("repositories", "0001_initial"), ] operations = [ migrations.AlterField( - model_name='repository', - name='description', - field=models.TextField(blank=True, default='', help_text='The description of the repository', max_length=2048, verbose_name='Repository description'), + model_name="repository", + name="description", + field=models.TextField( + blank=True, + default="", + help_text="The description of the repository", + max_length=2048, + verbose_name="Repository description", + ), preserve_default=False, ), migrations.AlterField( - model_name='repositoryfile', - name='repo_relative_file_path', - field=models.CharField(blank=True, help_text='File path, relative to the repository root.', max_length=512, null=True), + model_name="repositoryfile", + name="repo_relative_file_path", + field=models.CharField( + blank=True, + help_text="File path, relative to the repository root.", + max_length=512, + null=True, + ), ), ]