Skip to content

Commit

Permalink
Merge pull request #5 from leonmoonen/main
Browse files Browse the repository at this point in the history
removed dependency on the CWE filename in MITRE's zipfile and improved logging
  • Loading branch information
leonmoonen authored Aug 7, 2022
2 parents bc549a7 + 2be886b commit 246293f
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 74 deletions.
24 changes: 10 additions & 14 deletions Code/collect_commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def extract_project_links(df_master):
df_fixes = df_fixes.append(pd.Series(row), ignore_index=True)

df_fixes = df_fixes.drop_duplicates().reset_index(drop=True)
cf.logger.info('Number of collected references to vulnerability fixing commits:', len(df_fixes))
cf.logger.info(f'Number of collected references to vulnerability fixing commits: {len(df_fixes)}')
return df_fixes


Expand All @@ -111,7 +111,7 @@ def guess_pl(code):


def clean_string(signature):
    """Normalize a method signature for comparison.

    Strips leading/trailing whitespace and removes all internal spaces so
    that two renderings of the same signature compare equal.

    :param signature: method signature string (e.g. ``"int foo( int a )"``)
    :return: the signature with every space character removed
    """
    # The diff rendering duplicated this return (old + new line of the
    # commit); a single statement is the correct post-commit state.
    return signature.strip().replace(' ', '')


def get_method_code(source_code, start_line, end_line):
Expand All @@ -122,7 +122,7 @@ def get_method_code(source_code, start_line, end_line):
else:
return None
except Exception as e:
cf.logger.warning('Problem while getting method code from the file!', e)
cf.logger.warning(f'Problem while extracting method code from the changed file contents: {e}')
pass


Expand Down Expand Up @@ -161,19 +161,19 @@ def get_methods(file, file_change_id):
try:
if file.changed_methods:
cf.logger.debug('-' * 70)
cf.logger.debug('\nmethods_after: ')
cf.logger.debug('methods_after: ')
cf.logger.debug('- ' * 35)
for m in file.methods:
if m.name != '(anonymous)':
cf.logger.debug(m.long_name)

cf.logger.debug('\nmethods_before: ')
cf.logger.debug('methods_before: ')
cf.logger.debug('- ' * 35)
for mb in file.methods_before:
if mb.name != '(anonymous)':
cf.logger.debug(mb.long_name)

cf.logger.debug('\nchanged_methods: ')
cf.logger.debug('changed_methods: ')
cf.logger.debug('- ' * 35)
for mc in file.changed_methods:
if mc.name != '(anonymous)':
Expand All @@ -186,7 +186,7 @@ def get_methods(file, file_change_id):
# if clean_string(mc.long_name) == clean_string(mb.long_name) and mc.name != '(anonymous)':

if file.changed_methods:
methods_after, methods_before = changed_methods_both(file) # modified methods in source_code_after/_before
methods_after, methods_before = changed_methods_both(file) # in source_code_after/_before
if methods_before:
for mb in methods_before:
# filtering out code not existing, and (anonymous)
Expand Down Expand Up @@ -241,7 +241,7 @@ def get_methods(file, file_change_id):
return None

except Exception as e:
cf.logger.warning('Problem while fetching the methods!', e)
cf.logger.warning(f'Problem while fetching the methods: {e}')
pass


Expand Down Expand Up @@ -280,7 +280,6 @@ def get_files(commit):
'token_count': file.token_count,
'programming_language': programming_language,
}
file_methods = []
commit_files.append(file_row)
file_methods = get_methods(file, file_change_id)

Expand All @@ -292,7 +291,7 @@ def get_files(commit):
return commit_files, commit_methods

except Exception as e:
cf.logger.warning('Problem while fetching the files!', e)
cf.logger.warning(f'Problem while fetching the files: {e}')
pass


Expand Down Expand Up @@ -351,13 +350,11 @@ def extract_commits(repo_url, hashes):
'dmm_unit_size': commit.dmm_unit_size,
}
commit_files, commit_methods = get_files(commit)

repo_commits.append(commit_row)
repo_files.extend(commit_files)
repo_methods.extend(commit_methods)

except Exception as e:
cf.logger.warning('Problem while fetching the commits!', e)
cf.logger.warning(f'Problem while fetching the commits: {e}')
pass

if repo_commits:
Expand All @@ -378,5 +375,4 @@ def extract_commits(repo_url, hashes):
else:
df_repo_methods = None


return df_repo_commits, df_repo_files, df_repo_methods
33 changes: 12 additions & 21 deletions Code/collect_projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def filter_urls(urls):
return non_exist_urls



def convert_runtime(start_time, end_time):
"""
converts runtime of the slice of code more readable format
Expand All @@ -59,7 +58,6 @@ def convert_runtime(start_time, end_time):
return hours, minutes, seconds



def get_ref_links():
"""
retrieves reference links from CVE records to populate 'fixes' table
Expand All @@ -74,10 +72,9 @@ def get_ref_links():
df_master = pd.read_sql("SELECT * FROM cve", con=db.conn)
df_fixes = extract_project_links(df_master)

cf.logger.info('Checking if references still exist...')
cf.logger.info('Checking if references are still accessible...')
unique_urls = set(list(df_fixes.repo_url))

unfetched_urls = []
unfetched_urls = filter_urls(unique_urls)

if len(unfetched_urls) > 0:
Expand All @@ -101,13 +98,12 @@ def get_ref_links():
return df_fixes



def get_github_meta(repo_url, username, token):
"""
returns github meta-information of the repo_url
"""
owner, project = repo_url.split('/')[-2], repo_url.split('/')[-1]
meta_row ={}
meta_row = {}

if username == 'None':
git_link = Github()
Expand All @@ -128,15 +124,14 @@ def get_github_meta(repo_url, username, token):
'stars_count': repo.stargazers_count,
'owner': owner}
except BadCredentialsException as e:
cf.logger.warning(f'Credential problem while accessing GitHub repository {repo_url}\n', getattr(e, 'message', repr(e)))
cf.logger.warning(f'Credential problem while accessing GitHub repository {repo_url}: {e}')
pass # or exit(1)
except Exception as e:
cf.logger.warning(f'Other issues while getting meta-data for GitHub repository {repo_url}\n', e)
cf.logger.warning(f'Other issues while getting meta-data for GitHub repository {repo_url}: {e}')
pass # or exit(1)
return meta_row



def save_repo_meta(repo_url):
"""
populate repository meta-information in repository table.
Expand All @@ -153,8 +148,7 @@ def save_repo_meta(repo_url):
else:
df_meta.to_sql(name='repository', con=db.conn, if_exists="replace", index=False)
except Exception as e:
cf.logger.warning('Problem while fetching repository meta-information\n', e)

cf.logger.warning(f'Problem while fetching repository meta-information: {e}')


def store_tables(df_fixes):
Expand All @@ -178,8 +172,7 @@ def store_tables(df_fixes):
df_single_repo = df_fixes[df_fixes.repo_url == repo_url]
hashes = list(df_single_repo.hash.unique())
cf.logger.info('-' * 70)
cf.logger.info('Retrieving fixes for repo', pcount, 'of', len(repo_urls),
'-', repo_url.rsplit("/")[-1])
cf.logger.info(f'Retrieving fixes for repo {pcount} of {len(repo_urls)} - {repo_url.rsplit("/")[-1]}')

# extract_commits method returns data at different granularity levels
df_commit, df_file, df_method = extract_commits(repo_url, hashes)
Expand All @@ -189,35 +182,33 @@ def store_tables(df_fixes):
# ----------------appending each project data to the tables-------------------------------
df_commit = df_commit.applymap(str)
df_commit.to_sql(name="commits", con=db.conn, if_exists="append", index=False)
cf.logger.debug('#Commits :', len(df_commit))
cf.logger.debug(f'#Commits: {len(df_commit)}')

if df_file is not None:
df_file = df_file.applymap(str)
df_file.to_sql(name="file_change", con=db.conn, if_exists="append", index=False)
cf.logger.debug('#Files :', len(df_file))
cf.logger.debug(f'#Files: {len(df_file)}')

if df_method is not None:
df_method = df_method.applymap(str)
df_method.to_sql(name="method_change", con=db.conn, if_exists="append", index=False)
cf.logger.debug('#Methods :', len(df_method))
cf.logger.debug(f'#Methods: {len(df_method)}')

save_repo_meta(repo_url)
else:
cf.logger.warning(f'Could not retrieve commit information from: {repo_url}\n')
cf.logger.warning(f'Could not retrieve commit information from: {repo_url}')

except Exception as e:
cf.logger.warning(f'Problem occurred while retrieving the project: {repo_url}\n', e)
cf.logger.warning(f'Problem occurred while retrieving the project: {repo_url}: {e}')
pass # skip fetching repository if is not available.


cf.logger.debug('-' * 70)
if db.table_exists('commits'):
commit_count = str(pd.read_sql("SELECT count(*) FROM commits", con=db.conn).iloc[0].iloc[0])
cf.logger.debug(f'Number of commits retrieved from all the repos: {commit_count}')
else:
cf.logger.warning('The commits table does not exist')


if db.table_exists('file_change'):
file_count = str(pd.read_sql("SELECT count(*) from file_change;", con=db.conn).iloc[0].iloc[0])
cf.logger.debug(f'Number of files changed by all the commits: {file_count}')
Expand Down Expand Up @@ -247,7 +238,7 @@ def store_tables(df_fixes):
if db.table_exists('method_change'):
prune_tables(cf.DATABASE)
else:
cf.logger.warning('Data pruning is not possible because there is not information in method_change table')
cf.logger.warning('Data pruning is not possible because there is no information in method_change table')

cf.logger.info('The database is up-to-date.')
cf.logger.info('-' * 70)
Expand Down
26 changes: 12 additions & 14 deletions Code/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@
DATABASE = Path(DATA_PATH) / DATABASE_NAME
config_read = False

log_level_map = { 'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
'WARNING': logging.WARNING,
'ERROR': logging.ERROR,
'CRITICAL': logging.CRITICAL,
}
log_level_map = {'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
'WARNING': logging.WARNING,
'ERROR': logging.ERROR,
'CRITICAL': logging.CRITICAL
}

logging.basicConfig(level=LOGGING_LEVEL,
format='%(asctime)s %(levelname)-3s: %(message)s',
format='%(asctime)s %(name)s %(levelname)s %(message)s',
datefmt='%m/%d/%Y %H:%M:%S')
logger = logging.getLogger(__name__)
logger = logging.getLogger('CVEfixes')
logger.removeHandler(sys.stderr)


Expand All @@ -36,7 +36,7 @@ def read_config() -> None:
Sets global constants with values found in the ini file.
"""
global DATA_PATH, DATABASE_NAME, DATABASE, USER, TOKEN, SAMPLE_LIMIT, NUM_WORKERS, config_read
global DATA_PATH, DATABASE_NAME, DATABASE, USER, TOKEN, SAMPLE_LIMIT, NUM_WORKERS, LOGGING_LEVEL, config_read

config = ConfigParser()
if config.read(['.CVEfixes.ini',
Expand All @@ -62,12 +62,10 @@ def read_config() -> None:
read_config()
logger.setLevel(LOGGING_LEVEL)
logging.getLogger("requests").setLevel(LOGGING_LEVEL)
logging.getLogger("urllib3").setLevel(LOGGING_LEVEL)
logging.getLogger("urllib3.connection").setLevel(LOGGING_LEVEL)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("urllib3.connection").setLevel(logging.WARNING)
logging.getLogger("pathlib").setLevel(LOGGING_LEVEL)
logging.getLogger("subprocess").setLevel(LOGGING_LEVEL)
logging.getLogger("h5py._conv").setLevel(LOGGING_LEVEL)
logging.getLogger("h5py._conv").setLevel(logging.WARNING)
logging.getLogger("git.cmd").setLevel(LOGGING_LEVEL)
logging.getLogger("github.Requester").setLevel(LOGGING_LEVEL)


6 changes: 3 additions & 3 deletions Code/cve_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def import_cves():

# Check if the directory already has the json file or not ?
if os.path.isfile(Path(cf.DATA_PATH) / 'json' / extract_target):
cf.logger.warning('Reusing', year, 'CVE json file that was downloaded earlier...')
cf.logger.warning(f'Reusing the {year} CVE json file that was downloaded earlier...')
json_file = Path(cf.DATA_PATH) / 'json' / extract_target
else:
# url_to_open = urlopen(zip_file_url, timeout=10)
Expand All @@ -132,7 +132,7 @@ def import_cves():
df_cve = pd.DataFrame(yearly_data)
else:
df_cve = df_cve.append(pd.DataFrame(yearly_data))
cf.logger.info(str(year), 'CVE json file has been merged')
cf.logger.info(f'The CVE json for {year} has been merged')

df_cve = preprocess_jsons(df_cve)
df_cve = df_cve.applymap(str)
Expand All @@ -157,7 +157,7 @@ def import_cves():

no_ref_cwes = set(list(df_cwes_class.cwe_id)).difference(set(list(df_cwes.cwe_id)))
if len(no_ref_cwes) > 0:
cf.logger.debug('List of CWEs from CVEs that are not associated to cwe table are as follows:- ')
cf.logger.debug('List of CWEs from CVEs that are not associated to cwe table are as follows:')
cf.logger.debug(no_ref_cwes)

# Applying the assertion to cve-, cwe- and cwe_classification table.
Expand Down
25 changes: 15 additions & 10 deletions Code/extract_cwe_record.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import ast
import json
import os
import time
import fnmatch
import xml.etree.ElementTree as et
import pandas as pd
from pathlib import Path
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
from pandas import json_normalize

import configuration as cf

# --------------------------------------------------------------------------------------------------------
Expand All @@ -20,22 +20,27 @@ def extract_cwe():
:return df_CWE: dataframe of CWE category table
"""

if os.path.isdir(cf.DATA_PATH + "cwec_v4.4.xml"):
cf.logger.info("Reusing the CWE XML file that is already in the directory")
xtree = et.parse(cf.DATA_PATH + "cwec_v4.4.xml")
cwe_doc = sorted(Path(cf.DATA_PATH).glob('cwec_*.xml'))
if len(cwe_doc) > 0:
cf.logger.info('Reusing the CWE XML file that is already in the directory')
xtree = et.parse(cf.DATA_PATH + cwe_doc[-1])
else:
cwe_csv_url = "https://cwe.mitre.org/data/xml/cwec_latest.xml.zip"
cwe_csv_url = 'https://cwe.mitre.org/data/xml/cwec_latest.xml.zip'
cwe_zip = ZipFile(BytesIO(urlopen(cwe_csv_url).read()))
cwefile = cwe_zip.extract("cwec_v4.4.xml", cf.DATA_PATH)
xtree = et.parse(cwefile)
cwe_doc = sorted(fnmatch.filter(cwe_zip.namelist(),'cwec_*.xml')) # assumes all files at top level
assert len(cwe_doc) > 0, \
'Cannot find a CWE XML file in https://cwe.mitre.org/data/xml/cwec_latest.xml.zip'
cf.logger.info(f'Extracting CWE data from {cwe_doc[-1]}')
cwe_file = cwe_zip.extract(cwe_doc[-1], cf.DATA_PATH)
xtree = et.parse(cwe_file)
time.sleep(2)

xroot = xtree.getroot()
cat_flag = 0
rows = []

for parents in xroot[0:2]: # taking only 0, 1 and 2 (index 0 is for weaknesses, 1 for Categories, 2 for Views, 3 for External_References)

# include only types 0, 1 and 2 (0 is for weaknesses, 1 for Categories, 2 for Views, 3 for External_References)
for parents in xroot[0:2]:
for node in parents:
cwe_id = 'CWE-' + str(node.attrib['ID'])
cwe_name = node.attrib['Name'] if node.attrib['Name'] is not None else None
Expand Down
Loading

0 comments on commit 246293f

Please sign in to comment.