Datajson v3.0 #12289

Open
wants to merge 13 commits into
base: master
81 changes: 78 additions & 3 deletions doc/userguide/rules/datasets.rst
@@ -3,8 +3,8 @@
Datasets
========

Using the ``dataset``, ``datarep`` and ``datajson`` keywords it is possible
to match large amounts of data against any sticky buffer.

For example, to match against a DNS black list called ``dns-bl``::

@@ -145,6 +145,26 @@ reputation lists. A MD5 list, a SHA256 list, and a raw string (buffer) list.
The rules will only match if the data is in the list and the reputation
value is higher than 200.

datajson
~~~~~~~~

DataJSON allows matching data against a set and logging the data attached
to the matching value in the alert event.

Syntax::

datajson:<cmd>,<name>,<options>;

datajson:<isset|isnotset>,<name> \
[, type <string|md5|sha256|ipv4|ip>, load <file name>, memcap <size>, hashsize <size>, key <json_key>];

Example rules could look like::

alert http any any -> any any (msg:"IP match"; ip.dst; datajson:isset,bad_ips, type ip, load bad_ips.csv, key bad_ones; sid:8000001;)

In this example, the match occurs if the destination IP is in the set, and
the alert will have an ``alert.extra.bad_ones`` subobject containing the
JSON data associated with the value.
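As a sketch of what the ``bad_ips.csv`` file behind this rule could look like (the IP addresses and JSON fields here are made up for illustration; each line follows the ``<data>,<json_data>`` datajson file format described under File formats below, with the plain string representation used for the ip type):

```python
# Hypothetical sketch: build the bad_ips.csv datajson file used by the
# rule above. The IPs and JSON payloads are invented for illustration.
import json

entries = {
    "198.51.100.7": {"source": "blocklist-a", "first_seen": "2024-01-01"},
    "203.0.113.42": {"source": "blocklist-b", "first_seen": "2024-02-15"},
}

with open("bad_ips.csv", "w") as f:
    for ip, info in entries.items():
        # one "<data>,<json_data>" line per entry
        f.write("%s,%s\n" % (ip, json.dumps(info)))
```

On a match, the JSON object on the matching line is what ends up under ``alert.extra.bad_ones``.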

Rule Reloads
------------
@@ -243,6 +263,44 @@ Syntax::

dataset-dump

datajson-add
~~~~~~~~~~~~

Unix Socket command to add data to a set. On success, the addition becomes
active instantly.

Syntax::

datajson-add <set name> <set type> <data> <json_info>

set name
Name of an already defined dataset
type
Data type: string, md5, sha256, ipv4, ip
data
Data to add in serialized form (base64 for string, hex notation for md5/sha256, string representation for ipv4/ip)
json_info
Valid JSON object to attach to the data (see the datajson file format below)

Example adding 'google.com' to set 'myset'::

datajson-add myset string Z29vZ2xlLmNvbQ== {"city":"Mountain View"}
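The base64 serialization of the data argument can be sketched in Python; the set name and JSON payload are taken from the example above:

```python
# Sketch: serialize a string value and its JSON payload for datajson-add.
# The base64 step matches the "serialized form" required for string sets.
import base64
import json

value = "google.com"
info = {"city": "Mountain View"}

data = base64.b64encode(value.encode()).decode()
command = "datajson-add myset string %s %s" % (data, json.dumps(info))
print(command)  # datajson-add myset string Z29vZ2xlLmNvbQ== {"city": "Mountain View"}
```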

datajson-remove
~~~~~~~~~~~~~~~

Unix Socket command to remove data from a set. On success, the removal becomes
active instantly.

Syntax::

datajson-remove <set name> <set type> <data>

set name
Name of an already defined dataset
type
Data type: string, md5, sha256, ipv4, ip
data
Data to remove in serialized form (base64 for string, hex notation for md5/sha256, string representation for ipv4/ip)

File formats
------------

@@ -285,13 +343,30 @@ which when piped to ``base64 -d`` reveals its value::
datarep
~~~~~~~

The datarep format follows the dataset format, except that there is one
additional CSV field:

Syntax::

<data>,<value>


datajson
~~~~~~~~

The datajson format follows the dataset format, except that each line has
a comma separator followed by a second field that must contain a valid
JSON object:

Syntax::

<data>,<json_data>

e.g. for ua-seen with type string::

TW96aWxsYS80LjAgKGNvbXBhdGlibGU7ICk=,{"agent": "Mozilla", "version": "4.0"}
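A line in this format can be generated with a short Python sketch (illustration only; any tool that base64-encodes the data and appends a JSON object works):

```python
# Sketch: produce a datajson line for a string set such as ua-seen.
# The data field is base64-encoded, the second field is a JSON object.
import base64
import json

ua = "Mozilla/4.0 (compatible; )"
info = {"agent": "Mozilla", "version": "4.0"}

line = "%s,%s" % (base64.b64encode(ua.encode()).decode(), json.dumps(info))
print(line)

# the data part decodes back to the original user agent string
data_part = line.split(",", 1)[0]
assert base64.b64decode(data_part).decode() == ua
```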


.. _datasets_file_locations:

File Locations
61 changes: 61 additions & 0 deletions doc/userguide/rules/payload-keywords.rst
@@ -774,6 +774,67 @@ qualities of pcre as well. These are:
.. note:: The following characters must be escaped inside the content:
``;`` ``\`` ``"``

PCRE extraction
~~~~~~~~~~~~~~~

It is possible to capture groups from the regular expression and log them into the
alert events.

There are three capture scopes:

* pkt: the extracted group is logged as a pkt variable in ``metadata.pktvars``
* alert: the extracted group is logged to the ``alert.extra`` subobject
* flow: the extracted group is stored in a flow variable and ends up in ``metadata.flowvars``

To use the feature, the parameters of the ``pcre`` keyword need to be
extended: after the regular pcre regex and options comes a comma-separated
list of variable names. Each name is prefixed with ``flow:``, ``pkt:`` or
``alert:``, and the names can contain special characters. The names map to
the capturing subexpressions in order ::

pcre:"/([a-z]+)\/[a-z]+\/(.+)\/(.+)\/changelog$/GUR, \
flow:ua/ubuntu/repo,flow:ua/ubuntu/pkg/base, \
flow:ua/ubuntu/pkg/version";

This would result in the alert event containing something like ::

"metadata": {
"flowvars": [
{"ua/ubuntu/repo": "fr"},
{"ua/ubuntu/pkg/base": "curl"},
{"ua/ubuntu/pkg/version": "2.2.1"}
]
}

The other events on the same flow, such as the ``flow`` event, will
also carry the flow vars.

If this is not wanted, you can use the ``alert:`` construct to only
add the data to the alert event ::

pcre:"/([a-z]+)\/[a-z]+\/(.+)\/(.+)\/changelog$/GUR, \
alert:ua/ubuntu/repo,alert:ua/ubuntu/pkg/base, \
alert:ua/ubuntu/pkg/version";

With that syntax, the result of the extraction will appear like ::

"alert": {
"extra": {
"ua/ubuntu/repo": "fr",
"ua/ubuntu/pkg/base": "curl",
"ua/ubuntu/pkg/version": "2.2.1"
}
}

The extraction scopes can be combined.

It is also possible to extract a key/value pair in the ``pkt`` scope:
the first capture is the key, the second the value. The notation is
similar to the previous one ::

pcre:"^/([A-Z]+) (.*)\r\n/, pkt:key,pkt:value";

``key`` and ``value`` are hardcoded names that trigger the key/value extraction.
As a consequence, they can't be used as names for the variables.
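To see which substrings the documented regex captures, the groups can be checked with Python's ``re`` module (illustration only; Suricata evaluates the expression with PCRE, and the sample path is made up to match the ``ua/ubuntu`` example values):

```python
# Illustration: which substrings the capturing groups of the documented
# pcre expression extract. Python's re engine is used only as a stand-in
# for PCRE here; the sample path is invented.
import re

pattern = re.compile(r"([a-z]+)/[a-z]+/(.+)/(.+)/changelog$")
m = pattern.search("/fr/ubuntu/curl/2.2.1/changelog")
assert m is not None

repo, base, version = m.groups()
print(repo, base, version)  # fr curl 2.2.1
```

These three groups are what the ``flow:ua/ubuntu/repo``, ``flow:ua/ubuntu/pkg/base`` and ``flow:ua/ubuntu/pkg/version`` names bind to, in order.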

Suricata's modifiers
~~~~~~~~~~~~~~~~~~~~

7 changes: 6 additions & 1 deletion etc/schema.json
@@ -216,6 +216,11 @@
"xff": {
"type": "string"
},
"extra": {
"type": "object",
"additionalProperties": true,
"description": "Extra data created by keywords such as datajson"
},
"metadata": {
"type": "object",
"properties": {
@@ -2802,7 +2807,7 @@
"type": "string"
}
},
"additionalProperties": false
"additionalProperties": true
}
},
"flowints": {
32 changes: 32 additions & 0 deletions python/suricata/sc/specs.py
@@ -194,6 +194,38 @@
"required": 1,
},
],
"datajson-add": [
{
"name": "setname",
"required": 1,
},
{
"name": "settype",
"required": 1,
},
{
"name": "datavalue",
"required": 1,
},
{
"name": "datajson",
"required": 1,
},
],
"datajson-remove": [
{
"name": "setname",
"required": 1,
},
{
"name": "settype",
"required": 1,
},
{
"name": "datavalue",
"required": 1,
},
],
"get-flow-stats-by-id": [
{
"name": "flow_id",
7 changes: 7 additions & 0 deletions python/suricata/sc/suricatasc.py
@@ -113,6 +113,8 @@ def __init__(self, sck_path, verbose=False):
"memcap-show",
"dataset-add",
"dataset-remove",
"datajson-add",
"datajson-remove",
"get-flow-stats-by-id",
"dataset-clear",
"dataset-lookup",
@@ -218,6 +220,11 @@ def execute(self, command):
cmd_specs = argsd[cmd]
required_args_count = len([d["required"] for d in cmd_specs if d["required"] and not "val" in d])
arguments = dict()
# if all arguments are required in the command then we split at the count
# this way we can handle a last argument containing spaces (datajson-add for example)
non_req_args_count = len([d for d in cmd_specs if not d["required"] or "val" in d])
if non_req_args_count == 0:
full_cmd = command.split(maxsplit=required_args_count)
for c, spec in enumerate(cmd_specs, 1):
spec_type = str if "type" not in spec else spec["type"]
if spec["required"]:
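The ``maxsplit`` logic added to ``execute()`` above can be illustrated standalone; the command string is the ``datajson-add`` example from the documentation, and splitting at the required-argument count keeps the trailing JSON object (which contains a space) in one piece:

```python
# Sketch of the argument-splitting used for commands whose arguments are
# all required: split at the argument count so the last argument survives
# intact even when it contains spaces.
command = 'datajson-add myset string Z29vZ2xlLmNvbQ== {"city": "Mountain View"}'
required_args_count = 4  # setname, settype, datavalue, datajson

parts = command.split(maxsplit=required_args_count)
print(parts[-1])  # {"city": "Mountain View"}
```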
3 changes: 3 additions & 0 deletions src/Makefile.am
@@ -50,6 +50,7 @@ noinst_HEADERS = \
datasets.h \
datasets-ipv4.h \
datasets-ipv6.h \
datasets-json.h \
datasets-md5.h \
datasets-reputation.h \
datasets-sha256.h \
@@ -102,6 +103,7 @@ noinst_HEADERS = \
detect-config.h \
detect-content.h \
detect-csum.h \
detect-datajson.h \
detect-datarep.h \
detect-dataset.h \
detect-dce-iface.h \
@@ -667,6 +669,7 @@ libsuricata_c_a_SOURCES = \
detect-config.c \
detect-content.c \
detect-csum.c \
detect-datajson.c \
detect-datarep.c \
detect-dataset.c \
detect-dce-iface.c \
34 changes: 34 additions & 0 deletions src/datasets-ipv4.c
@@ -56,3 +56,37 @@ uint32_t IPv4Hash(uint32_t hash_seed, void *s)
void IPv4Free(void *s)
{
}

int IPv4JsonSet(void *dst, void *src)
{
IPv4TypeJson *src_s = src;
IPv4TypeJson *dst_s = dst;
memcpy(dst_s->ipv4, src_s->ipv4, sizeof(dst_s->ipv4));
dst_s->json.value = src_s->json.value;
dst_s->json.len = src_s->json.len;

return 0;
}

bool IPv4JsonCompare(void *a, void *b)
{
const IPv4TypeJson *as = a;
const IPv4TypeJson *bs = b;

return (memcmp(as->ipv4, bs->ipv4, sizeof(as->ipv4)) == 0);
}

uint32_t IPv4JsonHash(uint32_t hash_seed, void *s)
{
const IPv4TypeJson *str = s;
return hashword((uint32_t *)str->ipv4, 1, hash_seed);
}

// key data stays in the hash; only the attached JSON value is freed here
void IPv4JsonFree(void *s)
{
const IPv4TypeJson *as = s;
if (as->json.value) {
SCFree(as->json.value);
}
}
11 changes: 11 additions & 0 deletions src/datasets-ipv4.h
@@ -25,15 +25,26 @@
#define SURICATA_DATASETS_IPV4_H

#include "datasets-reputation.h"
#include "datasets-json.h"

typedef struct IPv4Type {
uint8_t ipv4[4];
DataRepType rep;
} IPv4Type;

typedef struct IPv4TypeJson {
uint8_t ipv4[4];
DataJsonType json;
} IPv4TypeJson;

int IPv4Set(void *dst, void *src);
bool IPv4Compare(void *a, void *b);
uint32_t IPv4Hash(uint32_t hash_seed, void *s);
void IPv4Free(void *s);

int IPv4JsonSet(void *dst, void *src);
bool IPv4JsonCompare(void *a, void *b);
uint32_t IPv4JsonHash(uint32_t hash_seed, void *s);
void IPv4JsonFree(void *s);

#endif /* SURICATA_DATASETS_IPV4_H */
33 changes: 33 additions & 0 deletions src/datasets-ipv6.c
@@ -56,3 +56,36 @@ uint32_t IPv6Hash(uint32_t hash_seed, void *s)
void IPv6Free(void *s)
{
}

int IPv6JsonSet(void *dst, void *src)
{
IPv6TypeJson *src_s = src;
IPv6TypeJson *dst_s = dst;
memcpy(dst_s->ipv6, src_s->ipv6, sizeof(dst_s->ipv6));
dst_s->json.value = src_s->json.value;
dst_s->json.len = src_s->json.len;

return 0;
}

bool IPv6JsonCompare(void *a, void *b)
{
const IPv6TypeJson *as = a;
const IPv6TypeJson *bs = b;

return (memcmp(as->ipv6, bs->ipv6, sizeof(as->ipv6)) == 0);
}

uint32_t IPv6JsonHash(uint32_t hash_seed, void *s)
{
const IPv6TypeJson *str = s;
return hashword((uint32_t *)str->ipv6, 4, hash_seed);
}

void IPv6JsonFree(void *s)
{
const IPv6TypeJson *as = s;
if (as->json.value) {
SCFree(as->json.value);
}
}