Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Harmonization fixes #634

Merged
merged 19 commits into from
Sep 14, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ install:
- pip install codecov
- pip install pep8
- sudo pip install .
- sudo cp /opt/intelmq/etc/examples/* /opt/intelmq/etc/
script:
- nosetests --with-coverage --cover-package=intelmq
- dpkg-buildpackage -us -uc
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ v1.0 (in development, master branch)
- ENH: Additional data types: integer, float and Boolean
- ENH: Added descriptions and matching types to all fields
- DOC: harmonization documentation has same fields as configuration, docs are generated from configuration
- ENH: New type LowercaseString and UppercaseString

#### Most important changes:
- `(source|destination).bgp_prefix` is now `(source|destination).network`
Expand Down
171 changes: 84 additions & 87 deletions docs/Harmonization-fields.md

Large diffs are not rendered by default.

75 changes: 44 additions & 31 deletions intelmq/bin/intelmq_gen_harm_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,45 +6,58 @@
import textwrap

import intelmq.lib.harmonization
from intelmq import HARMONIZATION_CONF_FILE
import pkg_resources

print("""
HEADER = """
Harmonization field names
=========================

|Section|Name|Type|Description|
|:------|:---|:---|:----------|""")


with open(HARMONIZATION_CONF_FILE) as fhandle:
HARM = json.load(fhandle)['event']

for key, value in sorted(HARM.items()):
section = ' '.join([sec.title() for sec in key.split('.')[:-1]])
print('|{}|{}|{}|{}|'.format(section, key, value['type'],
value['description']))

print("""
|:------|:---|:---|:----------|
"""
HEADER_1 = """

Harmonization types
-------------------

""")

for value in sorted(dir(intelmq.lib.harmonization)):
if value == 'GenericType' or value.startswith('__'):
continue
obj = getattr(intelmq.lib.harmonization, value)
try:
if issubclass(obj, intelmq.lib.harmonization.GenericType):
doc = getattr(obj, '__doc__', '')
if doc is None:
doc = ''
else:
doc = textwrap.dedent(doc)
print("""### {}
"""
TYPE_SECTION = """### {}
{}

""".format(value, doc))
except TypeError:
pass
"""


def main():
output = HEADER

HARM_CONF = pkg_resources.resource_filename('intelmq', 'etc/harmonization.conf')
with open(HARM_CONF) as fhandle:
HARM = json.load(fhandle)['event']

for key, value in sorted(HARM.items()):
section = ' '.join([sec.title() for sec in key.split('.')[:-1]])
output += '|{}|{}|[{}](#{})|{}|\n'.format(section, key, value['type'],
value['type'].lower(),
value['description'])

output += HEADER_1

for value in sorted(dir(intelmq.lib.harmonization)):
if value == 'GenericType' or value.startswith('__'):
continue
obj = getattr(intelmq.lib.harmonization, value)
try:
if issubclass(obj, intelmq.lib.harmonization.GenericType):
doc = getattr(obj, '__doc__', '')
if doc is None:
doc = ''
else:
doc = textwrap.dedent(doc)
output += TYPE_SECTION.format(value, doc)
except TypeError:
pass

return output

if __name__ == '__main__':
print(main())
43 changes: 30 additions & 13 deletions intelmq/bin/intelmq_psql_initdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@

Reads the harmonization configuration from
`/opt/intelmq/etc/harmonization.conf` and generates an SQL command from it.
The SQL file is saved in `/tmp/initdb.sql`.
The SQL file is saved in `/tmp/initdb.sql` or a temporary name if the other one
exists.
"""
import json
import os
import sys
import tempfile

from intelmq import HARMONIZATION_CONF_FILE


def main():
OUTPUTFILE = "/tmp/initdb.sql"
FIELDS = dict()

try:
Expand All @@ -30,8 +32,12 @@ def main():
value = DATA[field]

if value['type'] in ('String', 'Base64', 'URL', 'FQDN',
'MalwareName', 'ClassificationType'):
dbtype = 'varchar({})'.format(value.get('length', 2000))
'MalwareName', 'ClassificationType',
'LowercaseString', 'UppercaseString', 'Registry'):
if 'length' in value:
dbtype = 'varchar({})'.format(value['length'])
else:
dbtype = 'text'
elif value['type'] in ('IPAddress', 'IPNetwork'):
dbtype = 'inet'
elif value['type'] == 'DateTime':
Expand All @@ -47,23 +53,34 @@ def main():
elif value['type'] == 'JSON':
dbtype = 'json'
else:
print('Unknow type {!r}, assuming varchar(2000) by default'
''.format(value['type']))
dbtype = 'varchar(2000)'
raise ValueError('Unknow type %r.' % value['type'])

FIELDS[field] = dbtype

initdb = """CREATE table events (
"id" BIGSERIAL UNIQUE PRIMARY KEY,"""
initdb = """CREATE TABLE events (
"id" BIGSERIAL UNIQUE PRIMARY KEY,"""
for field, field_type in sorted(FIELDS.items()):
initdb += '\n "{name}" {type},'.format(name=field, type=field_type)

initdb = initdb[:-1] # remove last ','
initdb += "\n);"
return initdb

with open(OUTPUTFILE, 'w') as fp:
print("INFO - Writing %s file" % OUTPUTFILE)
fp.write(initdb)

if __name__ == '__main__':
main()
OUTPUTFILE = "/tmp/initdb.sql"
fp = None
try:
if os.path.exists(OUTPUTFILE):
print('INFO - File {} exists, generating temporary file.'.format(OUTPUTFILE))
os_fp, OUTPUTFILE = tempfile.mkstemp(suffix='.initdb.sql',
text=True)
fp = os.fdopen(os_fp, 'wt')
else:
fp = open(OUTPUTFILE, 'wt')
psql = main()
print("INFO - Writing %s file" % OUTPUTFILE)
fp.write(psql)
finally:
if fp:
fp.close()
52 changes: 33 additions & 19 deletions intelmq/etc/harmonization.conf
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
},
"classification.taxonomy": {
"description": "We recognize the need for the CSIRT teams to apply a static (incident) taxonomy to abuse data. With this goal in mind the type IOC will serve as a basis for this activity. Each value of the dynamic type mapping translates to an element in the static taxonomy. The European CSIRT teams for example have decided to apply the eCSIRT.net incident classification. The value of the taxonomy key is thus a derivative of the dynamic type above. For more information, check [ENISA taxonomies](http://www.enisa.europa.eu/activities/cert/support/incident-management/browsable/incident-handling-process/incident-taxonomy/existing-taxonomies).",
"length": 100,
"type": "String"
},
"classification.type": {
Expand Down Expand Up @@ -43,8 +44,9 @@
},
"destination.geolocation.cc": {
"description": "Country-Code according to ISO3166-1 alpha-2 for the destination IP.",
"length": 2,
"regex": "^[a-zA-Z0-9]{2}$",
"type": "String"
"type": "UppercaseString"
},
"destination.geolocation.city": {
"description": "Some geolocation services refer to city-level geolocation.",
Expand Down Expand Up @@ -92,7 +94,8 @@
},
"destination.registry": {
"description": "The IP registry a given ip address is allocated by.",
"type": "String"
"length": 7,
"type": "Registry"
},
"destination.reverse_dns": {
"description": "Reverse DNS name acquired through a reverse DNS query on an IP address. N.B. Record types other than PTR records may also appear in the reverse DNS tree. Furthermore, unfortunately, there is no rule prohibiting people from writing anything in a PTR record. Even Javascript will work. A final point is stripped, string is converted to lower case characters.",
Expand Down Expand Up @@ -121,8 +124,9 @@
},
"event_hash": {
"description": "Computed event hash with specific keys and values that identify a unique event. At present, the hash should default to using the SHA1 function. Please note that for an event hash to be able to match more than one event (deduplication) the receiver of an event should calculate it based on a minimal set of keys and values present in the event. Using for example the observation time in the calculation will most likely render the checksum useless for deduplication purposes.",
"regex": "^[a-f0-9$]+$",
"type": "LowercaseString"
"length": 40,
"regex": "^[A-F0-9./]+$",
"type": "UppercaseString"
},
"extra": {
"description": "All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc. **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.",
Expand All @@ -146,19 +150,22 @@
"type": "URL"
},
"malware.hash": {
"description": "A string depicting a checksum for a file, be it a malware sample for example. Includes hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29",
"regex": "^(\\$[a-z0-9=$.-]+\\$)?[a-f0-9./]+$",
"type": "LowercaseString"
"description": "A string depicting a checksum for a file, be it a malware sample for example. You may include the hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29 and use only printable characters. Please see https://github.com/certtools/intelmq/pull/634 for a discussion on this issue.",
"length": 200,
"regex": "^[ -~]+$",
"type": "String"
},
"malware.hash.md5": {
"description": "A string depicting a MD5 checksum for a file, be it a malware sample for example. Includes hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29",
"regex": "^(\\$[a-z0-9=$.-]+\\$)?[a-f0-9./]+$",
"type": "LowercaseString"
"description": "A string depicting an MD5 checksum for a file, be it a malware sample for example. You may include the hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29 and use only printable characters. Please see https://github.com/certtools/intelmq/pull/634 for a discussion on this issue.",
"length": 200,
"regex": "^[ -~]+$",
"type": "String"
},
"malware.hash.sha1": {
"description": "A string depicting a SHA1 checksum for a file, be it a malware sample for example. Includes hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29",
"regex": "^(\\$[a-z0-9=$.-]+\\$)?[a-f0-9./]+$",
"type": "LowercaseString"
"description": "A string depicting a SHA1 checksum for a file, be it a malware sample for example. You may include the hash type according to https://en.wikipedia.org/wiki/Crypt_%28C%29 and use only printable characters. Please see https://github.com/certtools/intelmq/pull/634 for a discussion on this issue.",
"length": 200,
"regex": "^[ -~]+$",
"type": "String"
},
"malware.name": {
"description": "A malware family name in lower case.",
Expand All @@ -172,23 +179,26 @@
},
"misp.attribute_uuid": {
"description": "MISP - Malware Information Sharing Platform & Threat Sharing UUID of an attribute.",
"length": 36,
"regex": "^[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}$",
"type": "LowercaseString"
},
"misp.event_uuid": {
"description": "MISP - Malware Information Sharing Platform & Threat Sharing UUID.",
"length": 36,
"regex": "^[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[0-9a-z]{12}$",
"type": "LowercaseString"
},
"protocol.application": {
"description": "e.g. vnc, ssh, sip, irc, http or p2p.",
"length": 50,
"length": 100,
"regex": "^[ -~]+$",
"type": "LowercaseString"
},
"protocol.transport": {
"description": "e.g. tcp, udp, icmp.",
"iregex": "^ip|icmp|igmp|ggp|ipencap|st2|tcp|cbt|egp|igp|bbn-rcc|nvp|pup|argus|emcon|xnet|chaos|udp|mux|dcn|hmp|prm|xns-idp|trunk-1|trunk-2|leaf-1|leaf-2|rdp|irtp|iso-tp4|netblt|mfe-nsp|merit-inp|sep|3pc|idpr|xtp|ddp|idpr-cmtp|tp\\+\\+|il|ipv6|sdrp|ipv6-route|ipv6-frag|idrp|rsvp|gre|mhrp|bna|esp|ah|i-nlsp|swipe|narp|mobile|tlsp|skip|ipv6-icmp|ipv6-nonxt|ipv6-opts|cftp|sat-expak|kryptolan|rvd|ippc|sat-mon|visa|ipcv|cpnx|cphb|wsn|pvp|br-sat-mon|sun-nd|wb-mon|wb-expak|iso-ip|vmtp|secure-vmtp|vines|ttp|nsfnet-igp|dgp|tcf|eigrp|ospf|sprite-rpc|larp|mtp|ax.25|ipip|micp|scc-sp|etherip|encap|gmtp|ifmp|pnni|pim|aris|scps|qnx|a/n|ipcomp|snp|compaq-peer|ipx-in-ip|vrrp|pgm|l2tp|ddx|iatp|st|srp|uti|smp|sm|ptp|isis|fire|crtp|crdup|sscopmce|iplt|sps|pipe|sctp|fc|divert$",
"length": 11,
"type": "LowercaseString"
},
"raw": {
Expand Down Expand Up @@ -230,8 +240,9 @@
},
"source.geolocation.cc": {
"description": "Country-Code according to ISO3166-1 alpha-2 for the source IP.",
"length": 2,
"regex": "^[a-zA-Z0-9]{2}$",
"type": "String"
"type": "UppercaseString"
},
"source.geolocation.city": {
"description": "Some geolocation services refer to city-level geolocation.",
Expand All @@ -243,13 +254,15 @@
},
"source.geolocation.cymru_cc": {
"description": "The country code denoted for the ip by the Team Cymru asn to ip mapping service.",
"length": 2,
"regex": "^[a-zA-Z0-9]{2}$",
"type": "String"
"type": "UppercaseString"
},
"source.geolocation.geoip_cc": {
"description": "MaxMind Country Code (ISO3166-1 alpha-2).",
"length": 2,
"regex": "^[a-zA-Z0-9]{2}$",
"type": "String"
"type": "UppercaseString"
},
"source.geolocation.latitude": {
"description": "Latitude coordinates derived from a geolocation service, such as MaxMind geoip db.",
Expand Down Expand Up @@ -290,7 +303,8 @@
},
"source.registry": {
"description": "The IP registry a given ip address is allocated by.",
"type": "String"
"length": 7,
"type": "Registry"
},
"source.reverse_dns": {
"description": "Reverse DNS name acquired through a reverse DNS query on an IP address. N.B. Record types other than PTR records may also appear in the reverse DNS tree. Furthermore, unfortunately, there is no rule prohibiting people from writing anything in a PTR record. Even Javascript will work. A final point is stripped, string is converted to lower case characters.",
Expand Down Expand Up @@ -324,7 +338,7 @@
"type": "Accuracy"
},
"feed.code": {
"description": "Code name for the feed, e.g. DFGS, HSDAG etc.",
"description": "Code name for the feed, e.g. DFGS, HSDAG etc.",
"length": 100,
"type": "String"
},
Expand Down
Loading