Skip to content

Commit

Permalink
chore: fix pre-commit failures
Browse files Browse the repository at this point in the history
Pre-commit hooks used to fail on the project's master branch when
running on all files; this commit fixes that.
  • Loading branch information
m4tx committed May 13, 2022
1 parent e5c10fe commit f0aeeb6
Show file tree
Hide file tree
Showing 11 changed files with 105 additions and 69 deletions.
7 changes: 6 additions & 1 deletion backend/fregepoc/celery_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
# Set the default Django settings module for the 'celery' program.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "fregepoc.settings")

# NOTE: os.getenv returns a *string* whenever the variable is set in the
# environment (and only falls back to the int default when it is unset).
# Cast explicitly so Celery always receives a numeric recycle limit.
app = Celery(
    "frege",
    worker_max_tasks_per_child=int(
        os.getenv("CELERY_WORKER_MAX_TASKS_PER_CHILD", default=4)
    ),
)

# Using a string here means the worker doesn't have to serialize
# the configuration object to child processes.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@
class Migration(migrations.Migration):
    # Gives the GithubIndexer model human-readable names in the admin.
    # Depends on 0002 so the verbose-name change is applied after the
    # current_page alteration.

    dependencies = [
        ("indexers", "0002_alter_githubindexer_current_page"),
    ]

    operations = [
        migrations.AlterModelOptions(
            name="githubindexer",
            options={
                "verbose_name": "Github Indexer",
                "verbose_name_plural": "Github Indexer",
            },
        ),
    ]
21 changes: 11 additions & 10 deletions backend/fregepoc/indexers/migrations/0004_add_sourceforgeindexer.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,40 @@
# Generated by Django 4.0.3 on 2022-04-08 19:35

from django.db import migrations, models

import fregepoc.indexers.models


class Migration(migrations.Migration):
    # Creates the SourceforgeIndexer model (a BaseIndexer subclass) with an
    # auto primary key and the page cursor used to resume crawling.

    dependencies = [
        ("indexers", "0003_alter_githubindexer_options"),
    ]

    operations = [
        migrations.CreateModel(
            name="SourceforgeIndexer",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "current_page",
                    models.PositiveIntegerField(
                        default=1,
                        help_text="The last visited page.",
                        verbose_name="current page",
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
            bases=(fregepoc.indexers.models.BaseIndexer,),
        ),
    ]
Expand Down
50 changes: 31 additions & 19 deletions backend/fregepoc/indexers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
from github import Github

from fregepoc.indexers.base import BaseIndexer
from fregepoc.indexers.sourceforge import (
SinglePageProjectsExtractor,
SingleProjectCodeUrlExtractor,
SingleProjectGitLinkExtractor,
SingleProjectGitUrlExtractor,
SingleProjectResponseExtractor,
)
from fregepoc.repositories.models import Repository

from fregepoc.indexers.sourceforge import SinglePageProjectsExtractor
from fregepoc.indexers.sourceforge import SingleProjectCodeUrlExtractor
from fregepoc.indexers.sourceforge import SingleProjectGitLinkExtractor
from fregepoc.indexers.sourceforge import SingleProjectGitUrlExtractor
from fregepoc.indexers.sourceforge import SingleProjectResponseExtractor


class GitHubIndexer(BaseIndexer):
min_forks = models.PositiveIntegerField(
Expand Down Expand Up @@ -94,8 +95,9 @@ def main_loop(self):

self.current_page += 1
if self.current_page >= 1000:
# The maximum page on the SourceForge is 999. When we reach the limit, we just start over.
# In the future, we may want to change by using some filters (categories) to scrap more data.
# The maximum page on the SourceForge is 999. When we reach
# the limit, we just start over. In the future, we may want
# to change by using some filters (categories) to scrap more data.
self.current_page = 0
self.save(update_fields=["current_page"])

Expand All @@ -105,15 +107,25 @@ def main_loop(self):
for project_name in projects:
repos_to_process = []

single_project_soup = self.singleProjectResponseExtractor.extract(project_name)
single_project_soup = self.singleProjectResponseExtractor.extract(
project_name
)

project_code_url = self.singleProjectCodeUrlExtractor.extract(single_project_soup)
repo_from_code_url = self.handle_code_url(project_name, project_code_url)
project_code_url = self.singleProjectCodeUrlExtractor.extract(
single_project_soup
)
repo_from_code_url = self.handle_code_url(
project_name, project_code_url
)
if repo_from_code_url is not None:
repos_to_process.append(repo_from_code_url)

project_git_ulr = self.singleProjectGitUrlExtractor.extract(single_project_soup)
repos_to_process.extend(self.handle_git_url(project_name, project_git_ulr))
project_git_ulr = self.singleProjectGitUrlExtractor.extract(
single_project_soup
)
repos_to_process.extend(
self.handle_git_url(project_name, project_git_ulr)
)

Repository.objects.bulk_create(repos_to_process)
return repos_to_process
Expand All @@ -122,7 +134,9 @@ def handle_code_url(self, project_name, project_code_url):
if project_code_url is None:
return None

project_git_url_from_code_url = self.singleProjectGitLinkExtractor.extract(project_code_url)
project_git_url_from_code_url = (
self.singleProjectGitLinkExtractor.extract(project_code_url)
)
if project_git_url_from_code_url is None:
return None

Expand All @@ -136,10 +150,7 @@ def handle_code_url(self, project_name, project_code_url):

def handle_git_url(self, project_name, project_git_urls):
extracted_git_url = [
(
subproject,
self.singleProjectGitLinkExtractor.extract(git_url)
)
(subproject, self.singleProjectGitLinkExtractor.extract(git_url))
for (subproject, git_url) in project_git_urls
]

Expand All @@ -151,7 +162,8 @@ def handle_git_url(self, project_name, project_git_urls):
repo_url=git_url, # TODO
commit_hash="HEAD", # TODO
)
for (subproject, git_url) in extracted_git_url if git_url is not None
for (subproject, git_url) in extracted_git_url
if git_url is not None
]

class Meta:
Expand Down
4 changes: 2 additions & 2 deletions backend/fregepoc/indexers/sourceforge/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .single_page_projects_extractor import SinglePageProjectsExtractor
from .single_project_response_extractor import SingleProjectResponseExtractor
from .single_project_git_link_extractor import SingleProjectGitLinkExtractor
from .single_project_code_url_exctractor import SingleProjectCodeUrlExtractor
from .single_project_git_link_extractor import SingleProjectGitLinkExtractor
from .single_project_git_url_extractor import SingleProjectGitUrlExtractor
from .single_project_response_extractor import SingleProjectResponseExtractor
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,19 @@ class SinglePageProjectsExtractor:
def extract(page_number):
if page_number <= 0:
return None
url = f'https://sourceforge.net/directory/?sort=popular&page={page_number}'
url = (
f"https://sourceforge.net/directory/?sort=popular"
f"&page={page_number}"
)
response = requests.get(url)

if response.status_code == 404:
return None

soup = BeautifulSoup(response.text, 'html.parser')
soup = BeautifulSoup(response.text, "html.parser")

projects_set = set()
for link in soup.find_all('a', href=re.compile(r'/projects/\w+')):
projects_set.add('/'.join(link['href'].split('/')[:3])[1:])
for link in soup.find_all("a", href=re.compile(r"/projects/\w+")):
projects_set.add("/".join(link["href"].split("/")[:3])[1:])

return projects_set
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ def extract(soup):
return

code_urls = set()
for span in soup.find_all('span'):
if span.text == 'Code':
code_urls.add(span.find_parents('a')[0]['href'][1:])
for span in soup.find_all("span"):
if span.text == "Code":
code_urls.add(span.find_parents("a")[0]["href"][1:])

return code_urls
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,18 @@


class SingleProjectGitLinkExtractor:
    """Extracts the ``git clone`` URL from a SourceForge code page."""

    @staticmethod
    def extract(code_url):
        """Return the git URL found on ``code_url``, or ``None``.

        ``code_url`` is a path relative to sourceforge.net; falsy input and
        pages without a recognizable "git clone ..." access URL yield
        ``None`` (implicitly).
        """
        if not code_url:
            return

        url = f"https://sourceforge.net/{code_url}"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        value = soup.find("input", {"id": "access_url"})
        if value:
            value = value.get("value")
            if value.startswith("git clone"):
                # The field looks like "git clone <url> <dir>"; the URL is
                # the third whitespace-separated token's predecessor, i.e.
                # index 2 of the split.
                git_link = value.split()[2]
                return git_link
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,25 @@ def extract(soup):
if not soup:
return code_urls

for li in soup.find_all('ul', {'class': 'dropdown'})[0]('li'):
for li in soup.find_all("ul", {"class": "dropdown"})[0]("li"):
try:
a = li('a')[0]
if a('span')[0].text.startswith('Git'):
href_link = a['href']
if href_link.startswith('/p'):
url = f'https://sourceforge.net/{href_link[1:]}'
a = li("a")[0]
if a("span")[0].text.startswith("Git"):
href_link = a["href"]
if href_link.startswith("/p"):
url = f"https://sourceforge.net/{href_link[1:]}"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
soup = BeautifulSoup(response.text, "html.parser")

for link in soup.find_all('div', {'class': 'list card'}):
element = link('a')[0]
cleaned_link = element['href']
if cleaned_link.startswith('/p'):
for link in soup.find_all(
"div", {"class": "list card"}
):
element = link("a")[0]
cleaned_link = element["href"]
if cleaned_link.startswith("/p"):
code_urls.add((element.text, cleaned_link[1:]))
except:
except Exception:
pass

return code_urls
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ def extract(project_name):
if not project_name:
return

url = f'https://sourceforge.net/{project_name}'
url = f"https://sourceforge.net/{project_name}"
response = requests.get(url)
return BeautifulSoup(response.text, 'html.parser')
return BeautifulSoup(response.text, "html.parser")
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,30 @@
class Migration(migrations.Migration):
    # Tightens two Repository-related fields: makes the description
    # non-null with an empty-string default, and documents/limits the
    # relative file path on RepositoryFile.

    dependencies = [
        ("repositories", "0001_initial"),
    ]

    operations = [
        migrations.AlterField(
            model_name="repository",
            name="description",
            field=models.TextField(
                blank=True,
                default="",
                help_text="The description of the repository",
                max_length=2048,
                verbose_name="Repository description",
            ),
            # The default is only used to backfill existing rows during the
            # migration, not kept on the model definition.
            preserve_default=False,
        ),
        migrations.AlterField(
            model_name="repositoryfile",
            name="repo_relative_file_path",
            field=models.CharField(
                blank=True,
                help_text="File path, relative to the repository root.",
                max_length=512,
                null=True,
            ),
        ),
    ]

0 comments on commit f0aeeb6

Please sign in to comment.