Skip to content

Commit

Permalink
Merge pull request #5 from leonmoonen/main
Browse files Browse the repository at this point in the history
removed dependency on the CWE filename in MITRE's zipfile and improved logging
  • Loading branch information
leonmoonen authored Aug 7, 2022
2 parents bc549a7 + 2be886b commit 246293f
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 74 deletions.
24 changes: 10 additions & 14 deletions Code/collect_commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def extract_project_links(df_master):
df_fixes = df_fixes.append(pd.Series(row), ignore_index=True)

df_fixes = df_fixes.drop_duplicates().reset_index(drop=True)
cf.logger.info('Number of collected references to vulnerability fixing commits:', len(df_fixes))
cf.logger.info(f'Number of collected references to vulnerability fixing commits: {len(df_fixes)}')
return df_fixes


Expand All @@ -111,7 +111,7 @@ def guess_pl(code):


def clean_string(signature):
    """Normalize a method signature for comparison.

    Strips leading/trailing whitespace and removes all internal spaces so
    that two renderings of the same signature compare equal.

    :param signature: method signature string (e.g. ``"int foo( int a )"``)
    :return: the signature with every space character removed
    """
    # The diff rendering duplicated this return (old + new line of the
    # commit); a single statement is the correct post-commit state.
    return signature.strip().replace(' ', '')


def get_method_code(source_code, start_line, end_line):
Expand All @@ -122,7 +122,7 @@ def get_method_code(source_code, start_line, end_line):
else:
return None
except Exception as e:
cf.logger.warning('Problem while getting method code from the file!', e)
cf.logger.warning(f'Problem while extracting method code from the changed file contents: {e}')
pass


Expand Down Expand Up @@ -161,19 +161,19 @@ def get_methods(file, file_change_id):
try:
if file.changed_methods:
cf.logger.debug('-' * 70)
cf.logger.debug('\nmethods_after: ')
cf.logger.debug('methods_after: ')
cf.logger.debug('- ' * 35)
for m in file.methods:
if m.name != '(anonymous)':
cf.logger.debug(m.long_name)

cf.logger.debug('\nmethods_before: ')
cf.logger.debug('methods_before: ')
cf.logger.debug('- ' * 35)
for mb in file.methods_before:
if mb.name != '(anonymous)':
cf.logger.debug(mb.long_name)

cf.logger.debug('\nchanged_methods: ')
cf.logger.debug('changed_methods: ')
cf.logger.debug('- ' * 35)
for mc in file.changed_methods:
if mc.name != '(anonymous)':
Expand All @@ -186,7 +186,7 @@ def get_methods(file, file_change_id):
# if clean_string(mc.long_name) == clean_string(mb.long_name) and mc.name != '(anonymous)':

if file.changed_methods:
methods_after, methods_before = changed_methods_both(file) # modified methods in source_code_after/_before
methods_after, methods_before = changed_methods_both(file) # in source_code_after/_before
if methods_before:
for mb in methods_before:
# filtering out code not existing, and (anonymous)
Expand Down Expand Up @@ -241,7 +241,7 @@ def get_methods(file, file_change_id):
return None

except Exception as e:
cf.logger.warning('Problem while fetching the methods!', e)
cf.logger.warning(f'Problem while fetching the methods: {e}')
pass


Expand Down Expand Up @@ -280,7 +280,6 @@ def get_files(commit):
'token_count': file.token_count,
'programming_language': programming_language,
}
file_methods = []
commit_files.append(file_row)
file_methods = get_methods(file, file_change_id)

Expand All @@ -292,7 +291,7 @@ def get_files(commit):
return commit_files, commit_methods

except Exception as e:
cf.logger.warning('Problem while fetching the files!', e)
cf.logger.warning(f'Problem while fetching the files: {e}')
pass


Expand Down Expand Up @@ -351,13 +350,11 @@ def extract_commits(repo_url, hashes):
'dmm_unit_size': commit.dmm_unit_size,
}
commit_files, commit_methods = get_files(commit)

repo_commits.append(commit_row)
repo_files.extend(commit_files)
repo_methods.extend(commit_methods)

except Exception as e:
cf.logger.warning('Problem while fetching the commits!', e)
cf.logger.warning(f'Problem while fetching the commits: {e}')
pass

if repo_commits:
Expand All @@ -378,5 +375,4 @@ def extract_commits(repo_url, hashes):
else:
df_repo_methods = None


return df_repo_commits, df_repo_files, df_repo_methods
33 changes: 12 additions & 21 deletions Code/collect_projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def filter_urls(urls):
return non_exist_urls



def convert_runtime(start_time, end_time):
"""
converts runtime of the slice of code more readable format
Expand All @@ -59,7 +58,6 @@ def convert_runtime(start_time, end_time):
return hours, minutes, seconds



def get_ref_links():
"""
retrieves reference links from CVE records to populate 'fixes' table
Expand All @@ -74,10 +72,9 @@ def get_ref_links():
df_master = pd.read_sql("SELECT * FROM cve", con=db.conn)
df_fixes = extract_project_links(df_master)

cf.logger.info('Checking if references still exist...')
cf.logger.info('Checking if references are still accessible...')
unique_urls = set(list(df_fixes.repo_url))

unfetched_urls = []
unfetched_urls = filter_urls(unique_urls)

if len(unfetched_urls) > 0:
Expand All @@ -101,13 +98,12 @@ def get_ref_links():
return df_fixes



def get_github_meta(repo_url, username, token):
"""
returns github meta-information of the repo_url
"""
owner, project = repo_url.split('/')[-2], repo_url.split('/')[-1]
meta_row ={}
meta_row = {}

if username == 'None':
git_link = Github()
Expand All @@ -128,15 +124,14 @@ def get_github_meta(repo_url, username, token):
'stars_count': repo.stargazers_count,
'owner': owner}
except BadCredentialsException as e:
cf.logger.warning(f'Credential problem while accessing GitHub repository {repo_url}\n', getattr(e, 'message', repr(e)))
cf.logger.warning(f'Credential problem while accessing GitHub repository {repo_url}: {e}')
pass # or exit(1)
except Exception as e:
cf.logger.warning(f'Other issues while getting meta-data for GitHub repository {repo_url}\n', e)
cf.logger.warning(f'Other issues while getting meta-data for GitHub repository {repo_url}: {e}')
pass # or exit(1)
return meta_row



def save_repo_meta(repo_url):
"""
populate repository meta-information in repository table.
Expand All @@ -153,8 +148,7 @@ def save_repo_meta(repo_url):
else:
df_meta.to_sql(name='repository', con=db.conn, if_exists="replace", index=False)
except Exception as e:
cf.logger.warning('Problem while fetching repository meta-information\n', e)

cf.logger.warning(f'Problem while fetching repository meta-information: {e}')


def store_tables(df_fixes):
Expand All @@ -178,8 +172,7 @@ def store_tables(df_fixes):
df_single_repo = df_fixes[df_fixes.repo_url == repo_url]
hashes = list(df_single_repo.hash.unique())
cf.logger.info('-' * 70)
cf.logger.info('Retrieving fixes for repo', pcount, 'of', len(repo_urls),
'-', repo_url.rsplit("/")[-1])
cf.logger.info(f'Retrieving fixes for repo {pcount} of {len(repo_urls)} - {repo_url.rsplit("/")[-1]}')

# extract_commits method returns data at different granularity levels
df_commit, df_file, df_method = extract_commits(repo_url, hashes)
Expand All @@ -189,35 +182,33 @@ def store_tables(df_fixes):
# ----------------appending each project data to the tables-------------------------------
df_commit = df_commit.applymap(str)
df_commit.to_sql(name="commits", con=db.conn, if_exists="append", index=False)
cf.logger.debug('#Commits :', len(df_commit))
cf.logger.debug(f'#Commits: {len(df_commit)}')

if df_file is not None:
df_file = df_file.applymap(str)
df_file.to_sql(name="file_change", con=db.conn, if_exists="append", index=False)
cf.logger.debug('#Files :', len(df_file))
cf.logger.debug(f'#Files: {len(df_file)}')

if df_method is not None:
df_method = df_method.applymap(str)
df_method.to_sql(name="method_change", con=db.conn, if_exists="append", index=False)
cf.logger.debug('#Methods :', len(df_method))
cf.logger.debug(f'#Methods: {len(df_method)}')

save_repo_meta(repo_url)
else:
cf.logger.warning(f'Could not retrieve commit information from: {repo_url}\n')
cf.logger.warning(f'Could not retrieve commit information from: {repo_url}')

except Exception as e:
cf.logger.warning(f'Problem occurred while retrieving the project: {repo_url}\n', e)
cf.logger.warning(f'Problem occurred while retrieving the project: {repo_url}: {e}')
pass # skip fetching repository if is not available.


cf.logger.debug('-' * 70)
if db.table_exists('commits'):
commit_count = str(pd.read_sql("SELECT count(*) FROM commits", con=db.conn).iloc[0].iloc[0])
cf.logger.debug(f'Number of commits retrieved from all the repos: {commit_count}')
else:
cf.logger.warning('The commits table does not exist')


if db.table_exists('file_change'):
file_count = str(pd.read_sql("SELECT count(*) from file_change;", con=db.conn).iloc[0].iloc[0])
cf.logger.debug(f'Number of files changed by all the commits: {file_count}')
Expand Down Expand Up @@ -247,7 +238,7 @@ def store_tables(df_fixes):
if db.table_exists('method_change'):
prune_tables(cf.DATABASE)
else:
cf.logger.warning('Data pruning is not possible because there is not information in method_change table')
cf.logger.warning('Data pruning is not possible because there is no information in method_change table')

cf.logger.info('The database is up-to-date.')
cf.logger.info('-' * 70)
Expand Down
26 changes: 12 additions & 14 deletions Code/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@
DATABASE = Path(DATA_PATH) / DATABASE_NAME
config_read = False

log_level_map = { 'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
'WARNING': logging.WARNING,
'ERROR': logging.ERROR,
'CRITICAL': logging.CRITICAL,
}
log_level_map = {'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
'WARNING': logging.WARNING,
'ERROR': logging.ERROR,
'CRITICAL': logging.CRITICAL
}

logging.basicConfig(level=LOGGING_LEVEL,
format='%(asctime)s %(levelname)-3s: %(message)s',
format='%(asctime)s %(name)s %(levelname)s %(message)s',
datefmt='%m/%d/%Y %H:%M:%S')
logger = logging.getLogger(__name__)
logger = logging.getLogger('CVEfixes')
logger.removeHandler(sys.stderr)


Expand All @@ -36,7 +36,7 @@ def read_config() -> None:
Sets global constants with values found in the ini file.
"""
global DATA_PATH, DATABASE_NAME, DATABASE, USER, TOKEN, SAMPLE_LIMIT, NUM_WORKERS, config_read
global DATA_PATH, DATABASE_NAME, DATABASE, USER, TOKEN, SAMPLE_LIMIT, NUM_WORKERS, LOGGING_LEVEL, config_read

config = ConfigParser()
if config.read(['.CVEfixes.ini',
Expand All @@ -62,12 +62,10 @@ def read_config() -> None:
read_config()
logger.setLevel(LOGGING_LEVEL)
logging.getLogger("requests").setLevel(LOGGING_LEVEL)
logging.getLogger("urllib3").setLevel(LOGGING_LEVEL)
logging.getLogger("urllib3.connection").setLevel(LOGGING_LEVEL)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("urllib3.connection").setLevel(logging.WARNING)
logging.getLogger("pathlib").setLevel(LOGGING_LEVEL)
logging.getLogger("subprocess").setLevel(LOGGING_LEVEL)
logging.getLogger("h5py._conv").setLevel(LOGGING_LEVEL)
logging.getLogger("h5py._conv").setLevel(logging.WARNING)
logging.getLogger("git.cmd").setLevel(LOGGING_LEVEL)
logging.getLogger("github.Requester").setLevel(LOGGING_LEVEL)


6 changes: 3 additions & 3 deletions Code/cve_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def import_cves():

# Check if the directory already has the json file or not ?
if os.path.isfile(Path(cf.DATA_PATH) / 'json' / extract_target):
cf.logger.warning('Reusing', year, 'CVE json file that was downloaded earlier...')
cf.logger.warning(f'Reusing the {year} CVE json file that was downloaded earlier...')
json_file = Path(cf.DATA_PATH) / 'json' / extract_target
else:
# url_to_open = urlopen(zip_file_url, timeout=10)
Expand All @@ -132,7 +132,7 @@ def import_cves():
df_cve = pd.DataFrame(yearly_data)
else:
df_cve = df_cve.append(pd.DataFrame(yearly_data))
cf.logger.info(str(year), 'CVE json file has been merged')
cf.logger.info(f'The CVE json for {year} has been merged')

df_cve = preprocess_jsons(df_cve)
df_cve = df_cve.applymap(str)
Expand All @@ -157,7 +157,7 @@ def import_cves():

no_ref_cwes = set(list(df_cwes_class.cwe_id)).difference(set(list(df_cwes.cwe_id)))
if len(no_ref_cwes) > 0:
cf.logger.debug('List of CWEs from CVEs that are not associated to cwe table are as follows:- ')
cf.logger.debug('List of CWEs from CVEs that are not associated to cwe table are as follows:')
cf.logger.debug(no_ref_cwes)

# Applying the assertion to cve-, cwe- and cwe_classification table.
Expand Down
25 changes: 15 additions & 10 deletions Code/extract_cwe_record.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import ast
import json
import os
import time
import fnmatch
import xml.etree.ElementTree as et
import pandas as pd
from pathlib import Path
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
from pandas import json_normalize

import configuration as cf

# --------------------------------------------------------------------------------------------------------
Expand All @@ -20,22 +20,27 @@ def extract_cwe():
:return df_CWE: dataframe of CWE category table
"""

if os.path.isdir(cf.DATA_PATH + "cwec_v4.4.xml"):
cf.logger.info("Reusing the CWE XML file that is already in the directory")
xtree = et.parse(cf.DATA_PATH + "cwec_v4.4.xml")
cwe_doc = sorted(Path(cf.DATA_PATH).glob('cwec_*.xml'))
if len(cwe_doc) > 0:
cf.logger.info('Reusing the CWE XML file that is already in the directory')
xtree = et.parse(cf.DATA_PATH + cwe_doc[-1])
else:
cwe_csv_url = "https://cwe.mitre.org/data/xml/cwec_latest.xml.zip"
cwe_csv_url = 'https://cwe.mitre.org/data/xml/cwec_latest.xml.zip'
cwe_zip = ZipFile(BytesIO(urlopen(cwe_csv_url).read()))
cwefile = cwe_zip.extract("cwec_v4.4.xml", cf.DATA_PATH)
xtree = et.parse(cwefile)
cwe_doc = sorted(fnmatch.filter(cwe_zip.namelist(),'cwec_*.xml')) # assumes all files at top level
assert len(cwe_doc) > 0, \
'Cannot find a CWE XML file in https://cwe.mitre.org/data/xml/cwec_latest.xml.zip'
cf.logger.info(f'Extracting CWE data from {cwe_doc[-1]}')
cwe_file = cwe_zip.extract(cwe_doc[-1], cf.DATA_PATH)
xtree = et.parse(cwe_file)
time.sleep(2)

xroot = xtree.getroot()
cat_flag = 0
rows = []

for parents in xroot[0:2]: # taking only 0, 1 and 2 (index 0 is for weaknesses, 1 for Categories, 2 for Views, 3 for External_References)

# include only types 0, 1 and 2 (0 is for weaknesses, 1 for Categories, 2 for Views, 3 for External_References)
for parents in xroot[0:2]:
for node in parents:
cwe_id = 'CWE-' + str(node.attrib['ID'])
cwe_name = node.attrib['Name'] if node.attrib['Name'] is not None else None
Expand Down
Loading

0 comments on commit 246293f

Please sign in to comment.