GRRYaraScanner module (#673)
* Add docker directory + files

* Update developer guide

* Add GRRYaraScanner and tests

* Linter's gonna lint

* Linter's gonna lint

* Update docstring

* Fix typo

* More verbosity in messages + grouping

* Bugfix

* Dedupe strings

* Fix grouping

* Bugfix

* Fix mypy errors
tomchop authored Nov 25, 2022
1 parent 5230702 commit e5ab846
Showing 6 changed files with 368 additions and 15 deletions.
1 change: 1 addition & 0 deletions dftimewolf/cli/dftimewolf_recipes.py
@@ -56,6 +56,7 @@
     'GRRHuntOsqueryDownloader': 'dftimewolf.lib.collectors.grr_hunt',
     'GRROsqueryCollector': 'dftimewolf.lib.collectors.grr_hosts',
     'GRRTimelineCollector': 'dftimewolf.lib.collectors.grr_hosts',
+    'GRRYaraScanner': 'dftimewolf.lib.collectors.grr_hosts',
     'LocalFilesystemCopy': 'dftimewolf.lib.exporters.local_filesystem',
     'LocalPlasoProcessor': 'dftimewolf.lib.processors.localplaso',
     'OsqueryCollector': 'dftimewolf.lib.collectors.osquery',
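For context, the mapping above associates each module name with the Python module that defines it, so the CLI can import collectors on demand. A minimal sketch of how such a registry can be resolved is shown below; the resolve_module_class helper is hypothetical, and dftimewolf's actual loading code may differ.

import importlib

def resolve_module_class(name: str, registry: dict) -> type:
  # Import the defining module only when the class is actually needed.
  module = importlib.import_module(registry[name])
  return getattr(module, name)

# Assuming the dict above is named MODULES,
# resolve_module_class('GRRYaraScanner', MODULES) would import
# dftimewolf.lib.collectors.grr_hosts and return the GRRYaraScanner class.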
192 changes: 177 additions & 15 deletions dftimewolf/lib/collectors/grr_hosts.py
@@ -12,6 +12,7 @@
 import pandas as pd
 
 from grr_api_client import errors as grr_errors
+from grr_api_client import flow
 from grr_api_client.client import Client
 from grr_response_proto import flows_pb2, jobs_pb2, timeline_pb2
 from grr_response_proto import osquery_pb2 as osquery_flows
@@ -229,12 +230,12 @@ def _LaunchFlow(self, client: Client, name: str, args: str) -> str:
       DFTimewolfError: If approvers are required but none were specified.
     """
     # Start the flow and get the flow ID
-    flow = self._WrapGRRRequestWithApproval(
+    grr_flow = self._WrapGRRRequestWithApproval(
         client, client.CreateFlow, self.logger, name=name, args=args)
-    if not flow:
+    if not grr_flow:
       return ''
 
-    flow_id = flow.flow_id  # type: str
+    flow_id = grr_flow.flow_id  # type: str
     self.PublishMessage(f'{flow_id}: Scheduled')
 
     if self.keepalive:
@@ -323,9 +324,9 @@ def _DownloadFiles(self, client: Client, flow_id: str) -> Optional[str]:
     Returns:
       str: path containing the downloaded files.
     """
-    flow = client.Flow(flow_id)
+    grr_flow = client.Flow(flow_id)
     is_timeline_flow = False
-    if flow.Get().data.name == 'TimelineFlow':
+    if grr_flow.Get().data.name == 'TimelineFlow':
       is_timeline_flow = True
     output_file_path = os.path.join(
         self.output_path, '.'.join((flow_id, 'body')))
@@ -339,9 +340,9 @@ def _DownloadFiles(self, client: Client, flow_id: str) -> Optional[str]:
       return None
 
     if is_timeline_flow:
-      file_archive = flow.GetCollectedTimelineBody()
+      file_archive = grr_flow.GetCollectedTimelineBody()
     else:
-      file_archive = flow.GetFilesArchive()
+      file_archive = grr_flow.GetFilesArchive()
 
     file_archive.WriteToFile(output_file_path)
 
@@ -367,6 +368,166 @@ def GetThreadPoolSize(self) -> int:
     """Thread pool size."""
     return GRR_THREAD_POOL_SIZE
 
+
+class GRRYaraScanner(GRRFlow):
+  """GRR Yara scanner.
+
+  Launches YaraProcessScans against one or multiple hosts, and stores a
+  pandas DataFrame containing the results.
+  """
+
+  # Can be overridden by internal modules to group results on.
+  GROUPING_KEY = 'grouping_key'
+
+  # pylint: disable=arguments-differ
+  def __init__(self,
+               state: DFTimewolfState,
+               name: Optional[str]=None,
+               critical: bool=False) -> None:
+    super(GRRYaraScanner, self).__init__(
+        state, name=name, critical=critical)
+    self.process_regex = ''
+    self.rule_text = ''
+    self.rule_count = 0
+    self._grouping = ''
+
+  def SetUp(
+      self,
+      reason: str,
+      hostnames: str,
+      process_regex: str,
+      grr_server_url: str,
+      grr_username: str,
+      grr_password: str,
+      approvers: Optional[str] = None,
+      verify: bool = True,
+      skip_offline_clients: bool = False) -> None:
+    """Sets up the module, storing Host containers and checking the regex."""
+    super().SetUp(
+        reason, grr_server_url, grr_username, grr_password,
+        approvers=approvers, verify=verify,
+        skip_offline_clients=skip_offline_clients)
+
+    for hostname in hostnames.strip().split(','):
+      hostname = hostname.strip()
+      if hostname:
+        self.state.StoreContainer(containers.Host(hostname=hostname))
+    self.state.DedupeContainers(containers.Host)
+
+    self.process_regex = process_regex
+    if self.process_regex:
+      try:
+        re.compile(self.process_regex)
+      except re.error as error:
+        self.ModuleError(
+            f'Invalid process_regex: {error}', critical=True)
+
+  def PreProcess(self) -> None:
+    """Concatenates Yara rules into one stacked rule.
+
+    This is so we only launch one GRR flow per host, instead of N flows
+    for the N rules that were stored upstream.
+    """
+    yara_rules = self.state.GetContainers(containers.YaraRule)
+    if not yara_rules:
+      self.logger.warning('No Yara rules found.')
+      return
+    self.rule_text = '\n'.join([r.rule_text for r in yara_rules])
+    self.rule_count = len(yara_rules)
+    self._grouping = f'# GRR Yara Scan - {datetime.datetime.now()}'
+
+  def Process(self, container: containers.Host) -> None:
+    """Runs the stacked Yara rule against a host's processes."""
+    if not self.rule_count:
+      return
+
+    self.logger.info(
+        f'Running {self.rule_count} Yara sigs against {container.hostname}')
+
+    hits = 0
+    for client in self._FindClients([container.hostname]):
+      grr_hostname = client.data.os_info.fqdn
+      flow_args = flows_pb2.YaraProcessScanRequest(
+          yara_signature=self.rule_text,
+          ignore_grr_process=True,
+          process_regex=self.process_regex,
+          dump_process_on_match=False
+      )
+
+      flow_id = self._LaunchFlow(client, 'YaraProcessScan', flow_args)
+      self.logger.info(
+          f'Launched flow {flow_id} on {client.client_id} ({grr_hostname})')
+
+      self._AwaitFlow(client, flow_id)
+
+      # Get the latest flow data from the GRR server.
+      grr_flow = client.Flow(flow_id).Get()
+      results = list(grr_flow.ListResults())
+      yara_hits_df = self._YaraHitsToDataFrame(client, results)
+
+      if yara_hits_df.empty:
+        self.logger.info(f'{flow_id}: No Yara hits on {grr_hostname}'
+                         f' ({client.client_id})')
+        return
+
+      self.PublishMessage(f'{flow_id}: found Yara hits on {grr_hostname}'
+                          f' ({client.client_id})')
+      dataframe = containers.DataFrame(
+          data_frame=yara_hits_df,
+          description=(f'List of processes in {grr_hostname} '
+                       f'({client.client_id}) with Yara hits.'),
+          name=f'Yara matches on {grr_hostname} ({client.client_id})',
+          source='GRRYaraCollector')
+      dataframe.SetMetadata(self.GROUPING_KEY, self._grouping)
+      self.state.StoreContainer(dataframe)
+      hits += 1
+
+    self.state.StoreContainer(
+        containers.Report(
+            'GRRYaraScan',  # Actually used as the report title.
+            (f'{self._grouping}\nGRRYaraScan found {hits} Yara '
+             f'hits on {container.hostname}'),
+            text_format='markdown',
+            metadata={self.GROUPING_KEY: self._grouping},
+        ))
+
+  def PostProcess(self) -> None:
+    """Not implemented."""
+
+  def _YaraHitsToDataFrame(
+      self,
+      client: Client,
+      results: List[flow.FlowResult]) -> pd.DataFrame:
+    """Converts the results of a GRR YaraProcessScan flow to a DataFrame.
+
+    Args:
+      client: The GRR client object that had matches.
+      results: The FlowResult objects obtained by calling ListResults on
+          the GRR flow object.
+
+    Returns:
+      A pandas DataFrame containing client / process / signature match
+      information.
+    """
+    entries = []
+    for r in results:
+      process = r.payload.process
+      for match in r.payload.match:
+        string_matches = set(sm.string_id for sm in match.string_matches)
+        entries.append({
+            'grr_client': client.client_id,
+            'grr_fqdn': client.data.os_info.fqdn,
+            'pid': process.pid,
+            'process': process.exe,
+            'username': process.username,
+            'cwd': process.cwd,
+            'rule_name': match.rule_name,
+            'string_matches': sorted(string_matches)
+        })
+    return pd.DataFrame(entries)
+
+  def GetThreadOnContainerType(self) -> Type[interface.AttributeContainer]:
+    """This module operates on Host containers."""
+    return containers.Host
+
+
 class GRRArtifactCollector(GRRFlow):
   """Artifact collector for GRR flows.
@@ -766,18 +927,18 @@ def _DownloadResults(self,
     Returns:
       List[pd.DataFrame]: the Osquery results.
     """
-    flow = client.Flow(flow_id)
-    list_results = list(flow.ListResults())
+    grr_flow = client.Flow(flow_id)
+    list_results = list(grr_flow.ListResults())
 
     if not list_results:
-      self.logger.info(f'No rows returned for flow ID {str(flow)}')
+      self.logger.info(f'No rows returned for flow ID {str(grr_flow)}')
       return list_results
 
     results = []
     for result in list_results:
       payload = result.payload
       if not isinstance(payload, osquery_flows.OsqueryResult):
-        self.logger.error(f'Incorrect results format from flow ID {flow}')
+        self.logger.error(f'Incorrect results format from flow ID {grr_flow}')
         continue
 
       headers = [column.name for column in payload.table.header.columns]
@@ -1113,15 +1274,15 @@ def _DownloadTimeline(self, client: Client, flow_id: str) -> Optional[str]:
           f'{output_file_path:s} already exists: Skipping')
       return None
 
-    flow = client.Flow(flow_id)
+    grr_flow = client.Flow(flow_id)
     if self._timeline_format == 1:
       ntfs_inodes = client.data.os_info.system.lower() == 'windows'
-      timeline = flow.GetCollectedTimelineBody(
+      timeline = grr_flow.GetCollectedTimelineBody(
           timestamp_subsecond_precision=True,
           inode_ntfs_file_reference_format=ntfs_inodes,
           backslash_escape=True)
     else:
-      timeline = flow.GetCollectedTimeline(self._timeline_format)
+      timeline = grr_flow.GetCollectedTimeline(self._timeline_format)
     timeline.WriteToFile(output_file_path)
 
     return output_file_path
@@ -1143,4 +1304,5 @@ def GetThreadOnContainerType(self) -> Type[interface.AttributeContainer]:
     GRRFileCollector,
     GRRFlowCollector,
     GRROsqueryCollector,
-    GRRTimelineCollector])
+    GRRTimelineCollector,
+    GRRYaraScanner])
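Two behaviors in the new class are worth illustrating: PreProcess stacks every upstream YaraRule container into a single signature so that only one YaraProcessScan flow is launched per host, and _YaraHitsToDataFrame flattens flow results into one row per process/rule match. A minimal standalone sketch follows; the rule texts and host values are fabricated, not taken from the commit.

import pandas as pd

# Stacking: two upstream YaraRule containers become one signature string,
# mirroring the '\n'.join() in PreProcess.
rule_texts = [
    'rule a { strings: $x = "evil" condition: $x }',
    'rule b { strings: $y = "worse" condition: $y }',
]
stacked_signature = '\n'.join(rule_texts)  # passed as yara_signature

# Output shape: one row per (process, rule) match, using the same column
# names as _YaraHitsToDataFrame. All values below are fabricated.
yara_hits_df = pd.DataFrame([{
    'grr_client': 'C.0123456789abcdef',
    'grr_fqdn': 'host1.example.com',
    'pid': 4242,
    'process': '/usr/bin/python3',
    'username': 'alice',
    'cwd': '/home/alice',
    'rule_name': 'a',
    'string_matches': ['$x'],
}])
print(yara_hits_df[['grr_fqdn', 'pid', 'rule_name']])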
2 changes: 2 additions & 0 deletions dftimewolf/lib/containers/containers.py
@@ -162,6 +162,8 @@ class YaraRule(interface.AttributeContainer):
     name: The name of the Yara rule.
     rule_text: The actual Yara rule string.
   """
+  CONTAINER_TYPE = 'yara_rule'
+
   def __init__(self, name: str, rule_text: str) -> None:
     super(YaraRule, self).__init__()
     self.name = name
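For upstream producers, a sketch of how a rule reaches GRRYaraScanner: any module that stores YaraRule containers in the shared state before GRRYaraScanner's PreProcess runs will have its rules picked up and stacked. The rule text is made up, and `state` stands in for the shared DFTimewolfState instance.

from dftimewolf.lib.containers import containers

rule = containers.YaraRule(
    name='suspicious_string',
    rule_text='rule suspicious_string { strings: $a = "evil" condition: $a }')
state.StoreContainer(rule)  # picked up later by GRRYaraScanner.PreProcess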