Skip to content

Commit

Permalink
ground_truth: discard all background flows
Browse files Browse the repository at this point in the history
  • Loading branch information
AlyaGomaa committed Nov 5, 2024
1 parent 9d20975 commit e9641f4
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 24 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ Ground truth flows are labeled using the netflow labeler. so each flow has a lab

# Limitations

* the labels in ground truth zeek dir have to be 'Malicious' or 'Benign' only. if any other label is present this tool will consider it "Benign"
* The labels in the ground truth zeek dir must be either 'Malicious' or 'Benign'. If any other label is present, this tool will completely discard the flow.
* The ground truth input can be either a zeek dir (JSON or tab-separated) or a conn.log file.

* all paths given as parameters to this tool must be absolute paths.
Expand Down
60 changes: 37 additions & 23 deletions parsers/ground_truth.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import utils.timestamp_handler
from typing import (
Tuple,
Dict,
List,
Optional,
Union,
)
from re import findall
from parsers.config import ConfigurationParser
Expand Down Expand Up @@ -161,15 +161,13 @@ def extract_label_from_line(self, line:str) -> str:
:return: malicious, benign or unknown
"""
pattern = r"Malicious[\s\t]+"
matches = findall(pattern, line)
if matches:
if findall(pattern, line):
return 'malicious'

pattern = r"Benign[\s\t]+"
matches = findall(pattern, line)
if matches:
if findall(pattern, line):
return 'benign'

return 'unknown'

def update_labels_ctr(self, label: str):
Expand Down Expand Up @@ -199,7 +197,7 @@ def handle_zeek_json(self, line:str) -> Tuple[str,str,str,str]:
if not aid:
return False

label = line.get('label', '')
label = line.get('label', '')
self.update_labels_ctr(label)

return label, aid, line['ts'], line['id.orig_h']
Expand All @@ -225,37 +223,50 @@ def handle_zeek_tabs(self, line:str) -> Optional[Tuple[str,str,str,str]]:
# spaces so we can't use python's split()
# using regex split, split line when you encounter more than 2 spaces
# in a row
line = line.split('\t') if '\t' in line else split(r'\s{2,}', line)
line: List[str] = line.split('\t') if (
'\t' in line) \
else split(r'\s{2,'r'}', line)

aid = self.handle_getting_aid(line)
if not aid:
return

return label, aid, line[0], line[2]

def extract_fields(self, line: str) -> Optional[dict]:
def extract_fields(self, line: str) -> Tuple[Union[bool,dict], str]:
"""
extracts the label and community id from the given line
uses zeek_file_type to extract fields based on the type of the given zeek dir
uses zeek_file_type to extract fields based on the type of the given
zeek dir
completely ignores gt flows that have labels other than benign or
malicious
:param line: line as read from the zeek log file
:return: returns a flow dict with {'aid': ..., 'label':...}
:return:
If it managed to extract the flow, returns the
extracted flow dict and no errors
If not, returns False and the error
"""
if self.zeek_file_type == 'json':
#TODO this wasn't tested before ok?
flow = self.handle_zeek_json(line)
elif self.zeek_file_type == 'tab-separated':
flow = self.handle_zeek_tabs(line)


if not flow:
return False, "Invalid flow"

try:
if flow[0] == "unknown":
return False, f"Unsupported flow label '{flow[0]}'"

return {
'label': flow[0],
'aid': flow[1],
'timestamp': flow[2],
'srcip': flow[3],
}
except (IndexError, TypeError):
}, ""
except (IndexError, TypeError) as e:
# one of the above 2 methods failed to parse the given line
return
return False, f"Problem extracting flow: {line} .. {e}"


def register_timewindow(self, ts) -> dict:
Expand Down Expand Up @@ -364,7 +375,10 @@ def label_tw(self, flow: dict, tw_registration_stats: dict):

def parse_file(self, filename: str):
"""
extracts the label and community id from each flow and stores them in the db
extracts the label and community id from each flow and stores them
in the db
Completely ignores flows that dont have benign or malicious in
their labels, e.g background flows
:param filename: the name of the zeek logfile without the path,
for example conn.log
this can be the file given to this tool using -gtf or 1 file
Expand All @@ -380,11 +394,10 @@ def parse_file(self, filename: str):
if line.startswith('#'):
continue

flow = self.extract_fields(line)
flow, err = self.extract_fields(line)
if not flow:
self.log(f"Problem extracting flow "
f"from line number {line_number}: ",
line,
self.log(f"{err}. Skipping flow at line",
line_number,
error=True)
continue

Expand Down Expand Up @@ -498,7 +511,8 @@ def parse(self):
os._exit(0)
except Exception as e:
self.log("An error occurred: ", e, error=True)
self.log("",f"{traceback.format_exc()}", error=True)
self.log("",f"{traceback.format_exc()}",
error=True)
os._exit(1)


0 comments on commit e9641f4

Please sign in to comment.