Skip to content

Commit

Permalink
Merge pull request #2912 from chaoss/main
Browse files Browse the repository at this point in the history
Updating Dev Branch
  • Loading branch information
sgoggins authored Sep 25, 2024
2 parents aae2f19 + c8eba65 commit b75a902
Show file tree
Hide file tree
Showing 9 changed files with 300 additions and 41 deletions.
14 changes: 14 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# This CITATION.cff reference content was generated from Zotero.
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
  - family-names: Goggins
    given-names: Sean
  - family-names: Lumbard
    given-names: Kevin
  - family-names: Germonprez
    given-names: Matt
title: "Open Source Community Health: Analytical Metrics and Their Corresponding Narratives"
doi: 10.1109/SoHeal52568.2021.00010
date-released: 2021
url: https://www.seangoggins.net/wp-content/plugins/zotpress/lib/request/request.dl.php?api_user_id=655145&dlkey=HNG22ZSU&content_type=application/pdf
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Augur NEW Release v0.76.1
# Augur NEW Release v0.76.2

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot). A public instance of 8Knot is available at https://metrix.chaoss.io, which is tied to a public instance of Augur at https://ai.chaoss.io.
Expand All @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.1
Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.2

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
Expand Down
158 changes: 158 additions & 0 deletions augur/api/metrics/deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,162 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No
return results


@register_metric()
def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None):
    """
    Returns the most recently collected libyear value for each dependency of a
    repo, or of every repo in a repo group.

    When ``repo_id`` is given, only that repository is queried; otherwise the
    results cover all repositories belonging to ``repo_group_id``.

    :param repo_group_id: The repository's group id
    :param repo_id: The repository's id, defaults to None
    :param period: Accepted for signature compatibility with the other metrics;
        not used by the query
    :param begin_date: Accepted for signature compatibility with the other
        metrics; not used by the query
    :param end_date: Accepted for signature compatibility with the other
        metrics; not used by the query
    :return: DataFrame with one row per repo/dependency and the columns
        rg_name, repo_group_id, repo_name, repo_id, repo_git, forked_from,
        repo_archived, name, libyear and most_recent_collection
    """

    if repo_id:

        # Sub-select "e" finds the latest collection date per dependency of
        # this repo; joining it back onto repo_deps_libyear ("f") keeps only
        # the rows from that latest collection.
        libyearSQL = s.sql.text("""
            SELECT
                rg_name,
                repo_group_id,
                repo_name,
                d.repo_id,
                repo_git,
                forked_from,
                repo_archived,
                c.name,
                c.libyear,
                MAX ( C.data_collection_date ) AS most_recent_collection
            FROM
                (
                SELECT A.rg_name AS rg_name,
                    A.repo_group_id AS repo_group_id,
                    b.repo_name AS repo_name,
                    b.repo_id AS repo_id,
                    b.repo_git AS repo_git,
                    b.forked_from AS forked_from,
                    b.repo_archived AS repo_archived
                FROM
                    repo_groups A,
                    repo b
                WHERE
                    A.repo_group_id = b.repo_group_id
                ORDER BY
                    rg_name,
                    repo_name
                ) d,
                (
                SELECT DISTINCT
                    f.repo_id,
                    f.NAME,
                    f.libyear,
                    f.data_collection_date
                FROM
                    ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e,
                    augur_data.repo_deps_libyear f
                WHERE
                    e.data_collection_date = f.data_collection_date and
                    e.repo_id = f.repo_id
                ORDER BY
                    NAME
                ) C
            WHERE
                d.repo_id = C.repo_id
                AND C.repo_id = :repo_id
            GROUP BY
                rg_name,
                repo_git,
                repo_group_id,
                repo_name,
                d.repo_id,
                forked_from,
                repo_archived,
                c.name,
                c.libyear
            ORDER BY
                repo_id;
        """)

        with current_app.engine.connect() as conn:
            results = pd.read_sql(libyearSQL, conn, params={'repo_id': repo_id})

    else:

        # Same shape as the single-repo query, but the inner query ("w") is
        # computed over every repo and the outer join against repo ("z")
        # restricts the rows to the requested repo group.
        libyearSQL = s.sql.text("""
            Select w.* from
            (
            SELECT
                rg_name,
                repo_group_id,
                repo_name,
                d.repo_id,
                repo_git,
                forked_from,
                repo_archived,
                c.name,
                c.libyear,
                MAX ( C.data_collection_date ) AS most_recent_collection
            FROM
                (
                SELECT A.rg_name AS rg_name,
                    A.repo_group_id AS repo_group_id,
                    b.repo_name AS repo_name,
                    b.repo_id AS repo_id,
                    b.repo_git AS repo_git,
                    b.forked_from AS forked_from,
                    b.repo_archived AS repo_archived
                FROM
                    repo_groups A,
                    repo b
                WHERE
                    A.repo_group_id = b.repo_group_id
                ORDER BY
                    rg_name,
                    repo_name
                ) d,
                (
                SELECT DISTINCT
                    f.repo_id,
                    f.NAME,
                    f.libyear,
                    f.data_collection_date
                FROM
                    ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e,
                    augur_data.repo_deps_libyear f
                WHERE
                    e.data_collection_date = f.data_collection_date and
                    e.repo_id = f.repo_id
                ORDER BY
                    NAME
                ) C
            WHERE
                d.repo_id = C.repo_id
            GROUP BY
                rg_name,
                repo_git,
                repo_group_id,
                repo_name,
                d.repo_id,
                forked_from,
                repo_archived,
                c.name,
                c.libyear
            ORDER BY
                repo_id) w,
                repo_groups y,
                repo z
            where w.repo_id=z.repo_id and
            y.repo_group_id=z.repo_group_id
            and z.repo_group_id = :repo_group_id
        """)

        with current_app.engine.connect() as conn:
            results = pd.read_sql(libyearSQL, conn, params={'repo_group_id': repo_group_id})
    return results

16 changes: 16 additions & 0 deletions augur/application/db/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,22 @@ def get_working_commits_by_repo_id(repo_id):

return working_commits

def get_missing_commit_message_hashes(repo_id):
    """Fetch the commit hashes of a repo that have no stored commit message.

    Selects every distinct ``cmt_commit_hash`` from ``commits`` for ``repo_id``
    that does not appear as a ``cmt_hash`` in ``commit_messages`` for the same
    repo.

    :param repo_id: The repository's id
    :return: The result of ``fetchall_data_from_sql_text`` for the query, or an
        empty list when the query fails (the lookup is best-effort)
    """
    # Function-scope import: only needed to report a failed query.
    import logging

    fetch_missing_hashes_sql = s.sql.text("""
        SELECT DISTINCT cmt_commit_hash FROM commits
        WHERE repo_id=:repo_id
        AND cmt_commit_hash NOT IN
        (SELECT DISTINCT cmt_hash FROM commit_messages WHERE repo_id=:repo_id);
    """).bindparams(repo_id=repo_id)

    try:
        missing_commit_hashes = fetchall_data_from_sql_text(fetch_missing_hashes_sql)
    except Exception:
        # Stay best-effort, but record why nothing was returned, and do not
        # swallow KeyboardInterrupt/SystemExit the way a bare ``except:`` does.
        logging.getLogger(__name__).exception(
            "Could not fetch missing commit message hashes for repo_id %s", repo_id
        )
        missing_commit_hashes = []

    return missing_commit_hashes

def get_worker_oauth_keys(platform: str):

with get_session() as session:
Expand Down
49 changes: 48 additions & 1 deletion augur/tasks/git/facade_tasks.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#SPDX-License-Identifier: MIT

import logging
import datetime
from celery import group, chain

from augur.application.db.lib import get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits, bulk_insert_dicts
from subprocess import check_output
from augur.application.db.lib import get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits, bulk_insert_dicts, get_missing_commit_message_hashes

from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits
from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set
Expand Down Expand Up @@ -157,6 +159,50 @@ def facade_start_contrib_analysis_task():
facade_helper.update_status('Updating Contributors')
facade_helper.log_activity('Info', 'Updating Contributors with commits')

@celery.task(base=AugurFacadeRepoCollectionTask)
def facade_fetch_missing_commit_messages(repo_git):
    """Backfill commit message records for commits of a repo that lack one.

    Looks up the commit hashes without a message record, reads each message
    from the local clone with ``git log``, and bulk-inserts the records in
    batches of 1000 keyed on ``repo_id`` and ``cmt_hash``. A failure on one
    commit is logged and that commit is skipped.

    :param repo_git: The git url of the repository to process
    """
    # Function-scope import so the file's import block is left untouched.
    from collections.abc import Mapping

    logger = logging.getLogger(facade_fetch_missing_commit_messages.__name__)
    facade_helper = FacadeHelper(logger)

    repo = get_repo_by_repo_git(repo_git)

    logger.debug(f"Fetching missing commit message records for repo {repo_git}")

    missing_message_hashes = get_missing_commit_message_hashes(repo.repo_id)
    if not missing_message_hashes:
        return

    # The clone location is the same for every commit, so resolve it once.
    absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name)
    repo_loc = f"{absolute_path}/.git"

    to_insert = []

    for row in missing_message_hashes:
        # NOTE(review): the query helper may return row mappings keyed by
        # column name rather than bare hash strings - confirm against
        # fetchall_data_from_sql_text. Both shapes are handled here.
        commit_hash = str(row["cmt_commit_hash"] if isinstance(row, Mapping) else row)

        try:
            # Pass the arguments as a list so a repo path containing
            # whitespace is not split into several arguments.
            raw_message = check_output(
                ["git", "--git-dir", repo_loc, "log", "--format=%B", "-n", "1", commit_hash]
            )
            # Commit messages are not guaranteed to be valid UTF-8; replace
            # undecodable bytes instead of failing on that commit every run.
            commit_message = raw_message.decode('utf-8', errors='replace').strip()

            msg_record = {
                'repo_id' : repo.repo_id,
                'cmt_msg' : commit_message,
                'cmt_hash' : commit_hash,
                'tool_source' : 'Facade',
                'tool_version' : '0.78?',
                'data_source' : 'git',
                'data_collection_date' : datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }

            # Flush a full batch before adding the next record.
            if len(to_insert) >= 1000:
                bulk_insert_dicts(logger, to_insert, CommitMessage, ["repo_id","cmt_hash"])
                to_insert = []

            to_insert.append(msg_record)
        except Exception as e:
            # Best-effort: one bad commit must not abort the whole backfill.
            logger.info(f'The exception is : {e}.')

    # Insert whatever is left over from the last partial batch.
    if to_insert:
        bulk_insert_dicts(logger, to_insert, CommitMessage, ["repo_id","cmt_hash"])


#enable celery multithreading
@celery.task(base=AugurFacadeRepoCollectionTask)
Expand Down Expand Up @@ -354,6 +400,7 @@ def generate_analysis_sequence(logger,repo_git, facade_helper):

analysis_sequence.append(trim_commits_post_analysis_facade_task.si(repo_git))

analysis_sequence.append(facade_fetch_missing_commit_messages.si(repo_git))

analysis_sequence.append(facade_analysis_end_facade_task.si())

Expand Down
5 changes: 3 additions & 2 deletions docs/new-install.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,9 @@ alter database augur owner to augur;

**If you're using PostgreSQL 15 or later**, default database permissions will prevent Augur's installer from configuring the database. Run the following statements after the above to fix this:
```sql
set search_path=augur;
grant all privileges on schema public to augur with grant option;
\connect augur
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO augur;
GRANT ALL PRIVILEGES ON SCHEMA public TO augur;
```

After that, return to your user by exiting `psql`
Expand Down
6 changes: 6 additions & 0 deletions docs/rabbitmq-troubleshooting.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Troubleshooting RabbitMQ Issues

Set the RabbitMQ consumer timeout to 200 hours (the value is given in milliseconds: 720,000,000 ms = 200 hours):
```
sudo rabbitmqctl eval 'application:set_env(rabbit, consumer_timeout, 720000000).'
```
4 changes: 2 additions & 2 deletions metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

__short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection"

__version__ = "0.76.1"
__release__ = "v0.76.1 (For the Good of the Data)"
__version__ = "0.76.2"
__release__ = "v0.76.2 (Pumpkin Space)"

__license__ = "MIT"
__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024"
Loading

0 comments on commit b75a902

Please sign in to comment.