certtools · aaronkaplan · Sep 14, 2016 · Aug 9, 2016 · Aug 9, 2016 · Aug 9, 2016
diff --git a/.travis.yml b/.travis.yml
@@ -13,6 +13,7 @@ install:
   - pip install codecov
   - pip install pep8
   - sudo pip install .
+  - sudo cp /opt/intelmq/etc/examples/* /opt/intelmq/etc/
 script:
   - nosetests --with-coverage --cover-package=intelmq
   - dpkg-buildpackage -us -uc

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -31,6 +31,7 @@ v1.0 (in development, master branch)
 - ENH: Additional data types: integer, float and Boolean
 - ENH: Added descriptions and matching types to all fields
 - DOC: harmonization documentation has same fields as configuration, docs are generated from configuration
+- ENH: New types LowercaseString and UppercaseString
 
 #### Most important changes:
 - `(source|destination).bgp_prefix` is now `(source|destination).network`

diff --git a/docs/Harmonization-fields.md b/docs/Harmonization-fields.md
diff --git a/intelmq/bin/intelmq_gen_harm_docs.py b/intelmq/bin/intelmq_gen_harm_docs.py
@@ -6,45 +6,58 @@
 import textwrap
 
 import intelmq.lib.harmonization
-from intelmq import HARMONIZATION_CONF_FILE
+import pkg_resources
 
-print("""
+HEADER = """
 Harmonization field names
 =========================
 
 |Section|Name|Type|Description|
-|:------|:---|:---|:----------|""")
-
-
-with open(HARMONIZATION_CONF_FILE) as fhandle:
-    HARM = json.load(fhandle)['event']
-
-for key, value in sorted(HARM.items()):
-    section = ' '.join([sec.title() for sec in key.split('.')[:-1]])
-    print('|{}|{}|{}|{}|'.format(section, key, value['type'],
-                                 value['description']))
-
-print("""
+|:------|:---|:---|:----------|
+"""
+HEADER_1 = """
 
 Harmonization types
 -------------------
 
-""")
-
-for value in sorted(dir(intelmq.lib.harmonization)):
-    if value == 'GenericType' or value.startswith('__'):
-        continue
-    obj = getattr(intelmq.lib.harmonization, value)
-    try:
-        if issubclass(obj, intelmq.lib.harmonization.GenericType):
-            doc = getattr(obj, '__doc__', '')
-            if doc is None:
-                doc = ''
-            else:
-                doc = textwrap.dedent(doc)
-            print("""### {}
+"""
+TYPE_SECTION = """### {}
 {}
 
-""".format(value, doc))
-    except TypeError:
-        pass
+"""
+
+
+def main():
+    output = HEADER
+
+    HARM_CONF = pkg_resources.resource_filename('intelmq', 'etc/harmonization.conf')
+    with open(HARM_CONF) as fhandle:
+        HARM = json.load(fhandle)['event']
+
+    for key, value in sorted(HARM.items()):
+        section = ' '.join([sec.title() for sec in key.split('.')[:-1]])
+        output += '|{}|{}|[{}](#{})|{}|\n'.format(section, key, value['type'],
+                                                  value['type'].lower(),
+                                                  value['description'])
+
+    output += HEADER_1
+
+    for value in sorted(dir(intelmq.lib.harmonization)):
+        if value == 'GenericType' or value.startswith('__'):
+            continue
+        obj = getattr(intelmq.lib.harmonization, value)
+        try:
+            if issubclass(obj, intelmq.lib.harmonization.GenericType):
+                doc = getattr(obj, '__doc__', '')
+                if doc is None:
+                    doc = ''
+                else:
+                    doc = textwrap.dedent(doc)
+                output += TYPE_SECTION.format(value, doc)
+        except TypeError:
+            pass
+
+    return output
+
+if __name__ == '__main__':
+    print(main())
diff --git a/intelmq/bin/intelmq_psql_initdb.py b/intelmq/bin/intelmq_psql_initdb.py
@@ -5,16 +5,18 @@
 
 Reads the harmonization configuration from
 `/opt/intelmq/etc/harmonization.conf` and generates an SQL command from it.
-The SQL file is saved in `/tmp/initdb.sql`.
+The SQL file is saved in `/tmp/initdb.sql`, or in a temporary file if that path
+already exists.
 """
 import json
+import os
 import sys
+import tempfile
 
 from intelmq import HARMONIZATION_CONF_FILE
 
 
 def main():
-    OUTPUTFILE = "/tmp/initdb.sql"
     FIELDS = dict()
 
     try:
@@ -30,8 +32,12 @@ def main():
         value = DATA[field]
 
         if value['type'] in ('String', 'Base64', 'URL', 'FQDN',
-                             'MalwareName', 'ClassificationType'):
-            dbtype = 'varchar({})'.format(value.get('length', 2000))
+                             'MalwareName', 'ClassificationType',
+                             'LowercaseString', 'UppercaseString', 'Registry'):
+            if 'length' in value:
+                dbtype = 'varchar({})'.format(value['length'])
+            else:
+                dbtype = 'text'
         elif value['type'] in ('IPAddress', 'IPNetwork'):
             dbtype = 'inet'
         elif value['type'] == 'DateTime':
@@ -47,23 +53,34 @@ def main():
         elif value['type'] == 'JSON':
             dbtype = 'json'
         else:
-            print('Unknow type {!r}, assuming varchar(2000) by default'
-                  ''.format(value['type']))
-            dbtype = 'varchar(2000)'
+            raise ValueError('Unknow type %r.' % value['type'])
 
         FIELDS[field] = dbtype
 
-    initdb = """CREATE table events (
-        "id" BIGSERIAL UNIQUE PRIMARY KEY,"""
+    initdb = """CREATE TABLE events (
+    "id" BIGSERIAL UNIQUE PRIMARY KEY,"""
     for field, field_type in sorted(FIELDS.items()):
         initdb += '\n    "{name}" {type},'.format(name=field, type=field_type)
 
     initdb = initdb[:-1]  # remove last ','
     initdb += "\n);"
+    return initdb
 
-    with open(OUTPUTFILE, 'w') as fp:
-        print("INFO - Writing %s file" % OUTPUTFILE)
-        fp.write(initdb)
 
 if __name__ == '__main__':
-    main()
+    OUTPUTFILE = "/tmp/initdb.sql"
+    fp = None
+    try:
+        if os.path.exists(OUTPUTFILE):
+            print('INFO - File {} exists, generating temporary file.'.format(OUTPUTFILE))
+            os_fp, OUTPUTFILE = tempfile.mkstemp(suffix='.initdb.sql',
+                                                 text=True)
+            fp = os.fdopen(os_fp, 'wt')
+        else:
+            fp = open(OUTPUTFILE, 'wt')
+        psql = main()
+        print("INFO - Writing %s file" % OUTPUTFILE)
+        fp.write(psql)
+    finally:
+        if fp:
+            fp.close()
diff --git a/intelmq/etc/harmonization.conf b/intelmq/etc/harmonization.conf
@@ -6,6 +6,7 @@
         },
         "classification.taxonomy": {
             "description": "We recognize the need for the CSIRT teams to apply a static (incident) taxonomy to abuse data. With this goal in mind the type IOC will serve as a basis for this activity. Each value of the dynamic type mapping translates to a an element in the static taxonomy. The European CSIRT teams for example have decided to apply the eCSIRT.net incident classification. The value of the taxonomy key is thus a derivative of the dynamic type above. For more information about check [ENISA taxonomies](http://www.enisa.europa.eu/activities/cert/support/incident-management/browsable/incident-handling-process/incident-taxonomy/existing-taxonomies).",
+            "length": 100,
             "type": "String"
         },
         "classification.type": {
@@ -43,8 +44,9 @@
         },
         "destination.geolocation.cc": {
             "description": "Country-Code accoriding to ISO3166-1 alpha-2 for the destination IP.",
+            "length": 2,
             "regex": "^[a-zA-Z0-9]{2}$",
-            "type": "String"
+            "type": "UppercaseString"
         },
         "destination.geolocation.city": {
             "description": "Some geolocation services refer to city-level geolocation.",
@@ -92,7 +94,8 @@
         },
         "destination.registry": {
             "description": "The IP registry a given ip address is allocated by.",
-            "type": "String"
+            "length": 7,
+            "type": "Registry"
         },
         "destination.reverse_dns": {
             "description": "Reverse DNS name acquired through a reverse DNS query on an IP address. N.B. Record types other than PTR records may also appear in the reverse DNS tree. Furthermore, unfortunately, there is no rule prohibiting people from writing anything in a PTR record. Even Javascript will work. A final point is stripped, string is converted to lower case characters.",
@@ -121,8 +124,9 @@
         },
         "event_hash": {
             "description": "Computed event hash with specific keys and values that identify a unique event. At present, the hash should default to using the SHA1 function. Please note that for an event hash to be able to match more than one event (deduplication) the receiver of an event should calculate it based on a minimal set of keys and values present in the event. Using for example the observation time in the calculation will most likely render the checksum useless for deduplication purposes.",
-            "regex": "^[a-f0-9$]+$",
-            "type": "LowercaseString"
+            "length": 40,
+            "regex": "^[A-F0-9./]+$",
+            "type": "UppercaseString"
         },
         "extra": {
             "description": "All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc.  **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.",
@@ -146,19 +150,22 @@
             "type": "URL"
         },
         "malware.hash": {
-            "description": "A string depicting a checksum for a file, be it a malware sample for example. Includes hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29",
-            "regex": "^(\\$[a-z0-9=$.-]+\\$)?[a-f0-9./]+$",
-            "type": "LowercaseString"
+            "description": "A string depicting a checksum for a file, be it a malware sample for example. You may include the hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29 and use only printable characters. Please see https://github.com/certtools/intelmq/pull/634 for a discussion on this issue.",
+            "length": 200,
+            "regex": "^[ -~]+$",
+            "type": "String"
         },
         "malware.hash.md5": {
-            "description": "A string depicting a MD5 checksum for a file, be it a malware sample for example. Includes hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29",
-            "regex": "^(\\$[a-z0-9=$.-]+\\$)?[a-f0-9./]+$",
-            "type": "LowercaseString"
+            "description": "A string depicting an MD5 checksum for a file, be it a malware sample for example. You may include the hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29 and use only printable characters. Please see https://github.com/certtools/intelmq/pull/634 for a discussion on this issue.",
+            "length": 200,
+            "regex": "^[ -~]+$",
+            "type": "String"
         },
         "malware.hash.sha1": {
-            "description": "A string depicting a SHA1 checksum for a file, be it a malware sample for example. Includes hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29",
-            "regex": "^(\\$[a-z0-9=$.-]+\\$)?[a-f0-9./]+$",
-            "type": "LowercaseString"
+            "description": "A string depicting a SHA1 checksum for a file, be it a malware sample for example. You may include the hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29 and use only printable characters. Please see https://github.com/certtools/intelmq/pull/634 for a discussion on this issue.",
+            "length": 200,
+            "regex": "^[ -~]+$",
+            "type": "String"
         },
         "malware.name": {
             "description": "A malware family name in lower case.",
@@ -172,23 +179,26 @@
         },
         "misp.attribute_uuid": {
             "description": "MISP - Malware Information Sharing Platform & Threat Sharing UUID of an attribute.",
+            "length": 36,
             "regex": "^[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}$",
             "type": "LowercaseString"
         },
         "misp.event_uuid": {
             "description": "MISP - Malware Information Sharing Platform & Threat Sharing UUID.",
+            "length": 36,
             "regex": "^[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[0-9a-z]{12}$",
             "type": "LowercaseString"
         },
         "protocol.application": {
             "description": "e.g. vnc, ssh, sip, irc, http or p2p.",
-            "length": 50,
+            "length": 100,
             "regex": "^[ -~]+$",
             "type": "LowercaseString"
         },
         "protocol.transport": {
             "description": "e.g. tcp, udp, icmp.",
             "iregex": "^ip|icmp|igmp|ggp|ipencap|st2|tcp|cbt|egp|igp|bbn-rcc|nvp|pup|argus|emcon|xnet|chaos|udp|mux|dcn|hmp|prm|xns-idp|trunk-1|trunk-2|leaf-1|leaf-2|rdp|irtp|iso-tp4|netblt|mfe-nsp|merit-inp|sep|3pc|idpr|xtp|ddp|idpr-cmtp|tp\\+\\+|il|ipv6|sdrp|ipv6-route|ipv6-frag|idrp|rsvp|gre|mhrp|bna|esp|ah|i-nlsp|swipe|narp|mobile|tlsp|skip|ipv6-icmp|ipv6-nonxt|ipv6-opts|cftp|sat-expak|kryptolan|rvd|ippc|sat-mon|visa|ipcv|cpnx|cphb|wsn|pvp|br-sat-mon|sun-nd|wb-mon|wb-expak|iso-ip|vmtp|secure-vmtp|vines|ttp|nsfnet-igp|dgp|tcf|eigrp|ospf|sprite-rpc|larp|mtp|ax.25|ipip|micp|scc-sp|etherip|encap|gmtp|ifmp|pnni|pim|aris|scps|qnx|a/n|ipcomp|snp|compaq-peer|ipx-in-ip|vrrp|pgm|l2tp|ddx|iatp|st|srp|uti|smp|sm|ptp|isis|fire|crtp|crdup|sscopmce|iplt|sps|pipe|sctp|fc|divert$",
+            "length": 11,
             "type": "LowercaseString"
         },
         "raw": {
@@ -230,8 +240,9 @@
         },
         "source.geolocation.cc": {
             "description": "Country-Code accoriding to ISO3166-1 alpha-2 for the source IP.",
+            "length": 2,
             "regex": "^[a-zA-Z0-9]{2}$",
-            "type": "String"
+            "type": "UppercaseString"
         },
         "source.geolocation.city": {
             "description": "Some geolocation services refer to city-level geolocation.",
@@ -243,13 +254,15 @@
         },
         "source.geolocation.cymru_cc": {
             "description": "The country code denoted for the ip by the Team Cymru asn to ip mapping service.",
+            "length": 2,
             "regex": "^[a-zA-Z0-9]{2}$",
-            "type": "String"
+            "type": "UppercaseString"
         },
         "source.geolocation.geoip_cc": {
             "description": "MaxMind Country Code (ISO3166-1 alpha-2).",
+            "length": 2,
             "regex": "^[a-zA-Z0-9]{2}$",
-            "type": "String"
+            "type": "UppercaseString"
         },
         "source.geolocation.latitude": {
             "description": "Latitude coordinates derived from a geolocation service, such as MaxMind geoip db.",
@@ -290,7 +303,8 @@
         },
         "source.registry": {
             "description": "The IP registry a given ip address is allocated by.",
-            "type": "String"
+            "length": 7,
+            "type": "Registry"
         },
         "source.reverse_dns": {
             "description": "Reverse DNS name acquired through a reverse DNS query on an IP address. N.B. Record types other than PTR records may also appear in the reverse DNS tree. Furthermore, unfortunately, there is no rule prohibiting people from writing anything in a PTR record. Even Javascript will work. A final point is stripped, string is converted to lower case characters.",
@@ -324,7 +338,7 @@
             "type": "Accuracy"
         },
         "feed.code": {
-            "description": "Code name for the feed, e.g.  DFGS, HSDAG etc.",
+            "description": "Code name for the feed, e.g. DFGS, HSDAG etc.",
             "length": 100,
             "type": "String"
         },