From b2a7617ce045bf2fe5c05d6902815d822998a0d4 Mon Sep 17 00:00:00 2001
From: Vlad Arama <vlad.arama@ericsson.com>
Date: Mon, 29 Jan 2024 15:37:27 -0500
Subject: [PATCH 1/5] rename script.py to log_anonymizer.py

Signed-off-by: Vlad Arama <vlad.arama@ericsson.com>
---
 script.py => log_anonymizer.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename script.py => log_anonymizer.py (100%)

diff --git a/script.py b/log_anonymizer.py
similarity index 100%
rename from script.py
rename to log_anonymizer.py

From 82b035c6bfa4bac8fb3b7a066c024a07e2d8a280 Mon Sep 17 00:00:00 2001
From: Vlad Arama <86936229+vladarama@users.noreply.github.com>
Date: Mon, 29 Jan 2024 15:48:24 -0500
Subject: [PATCH 2/5] Update README.md

---
 README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b6b580b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,45 @@
+# Log Anonymizer
+
+Log Anonymizer is a Python-based tool for anonymizing sensitive information in log files. It helps maintain privacy by masking sensitive data such as IP addresses, user IDs, endpoints, timestamps and so on.
+At the moment, the tool only supports HTTP, SSH and HA Proxy log formats. The Log Anonymizer can be used for other log formats, but complete anonymization might not occur, since the script might not be able to correctly identify sensitive information.
+
+## Features
+- Anonymize IP Addresses: Replaces original IP addresses with randomized ones, maintaining the same format.
+- Anonymize User IDs: Replaces user IDs with randomly generated names.
+- Anonymize Endpoints: Replaces each segment of an endpoint path with a randomly generated name, maintaining the same number of path segments.
+- Anonymize Timestamps (optional): Replaces timestamps with their anonymized version while keeping the same structure.
+- Selective Anonymization: Provides options to selectively anonymize only IPs, endpoints, user IDs, or timestamps (or any combination of them).
+- Lookup Table Creation: Generates lookup tables mapping anonymized data back to the original data for reference.
+
+## Requirements
+- Python 3.x
+- `namesgenerator` external library
+
+## Installation
+1 - Clone the repository or download the `log_anonymizer.py` file.
+
+2 - Ensure Python 3.x is installed on your system.
+
+3 - Install the `namesgenerator` library by running `pip install namesgenerator` in your terminal.
+
+## Usage
+`python log_anonymizer.py <input_directory> <output_directory> [options]`
+
+### Arguments
+- input_directory: Directory containing the log files to be anonymized.
+- output_directory: Directory where the anonymized files and lookup tables will be stored.
+  
+### Options
+- --ip: Anonymize only IP addresses.
+- --endpoint: Anonymize only endpoints.
+- --user: Anonymize only user IDs.
+- --timestamps: Anonymize only timestamps.
+
+### Output
+- Anonymized log files will be saved in the specified output directory.
+- A lookup table for each file will be generated in the same output directory, mapping anonymized data to original data.
+
+## Example
+`python log_anonymizer.py /path/to/logs /path/to/output --ip --user`
+
+This command will anonymize IP addresses and user IDs in log files located in `/path/to/logs` and save the anonymized logs to `/path/to/output`.

From 1186f99c1c3eead9a05300a7ae64e7fdea9c811a Mon Sep 17 00:00:00 2001
From: Matthew Khouzam <matthew.khouzam@ericsson.com>
Date: Mon, 29 Jan 2024 23:24:29 -0500
Subject: [PATCH 3/5] Update log_anonymizer.py

Use Black Code style and pylint checker
---
 log_anonymizer.py | 353 +++++++++++++++++++++++++++++-----------------
 1 file changed, 226 insertions(+), 127 deletions(-)

diff --git a/log_anonymizer.py b/log_anonymizer.py
index d150415..a35abd2 100644
--- a/log_anonymizer.py
+++ b/log_anonymizer.py
@@ -1,33 +1,44 @@
+"""
+Log Anonymizer
+
+Anonymizes log timestamp, IPs and endpoints.
+"""
 import os
 import random
 import re
-import namesgenerator
 import argparse
+import namesgenerator
+
+__author__ = "Vlad Arama"
+__copyright__ = "Copyright 2024, Ericsson"
+__credits__ = ["Vlad Arama"]
+__license__ = "MIT"
 
 # General Regex Patterns
-ip_pattern = r'\b\d{1,3}(?:\.\d{1,3}){2,}\b'
-timestamps_pattern = r'(\d{4}:\d{2}:\d{2}:\d{2})|(\d{2}:\d{2}:\d{2}[,\.]\d{3})'
-endpoint_pattern = r'\b[^\/\s]+\/(?![0-9])[^\/\s]+(?:\/(?![0-9])[^\/\s]+)?'
-user_id_pattern = r'\s[a-z][a-z0-9]{4,19}\s'
+IP_PATTERN = r"\b\d{1,3}(?:\.\d{1,3}){2,}\b"
+TIMESTAMP_PATTERN = r"(\d{4}:\d{2}:\d{2}:\d{2})|(\d{2}:\d{2}:\d{2}[,\.]\d{3})"
+ENDPOINT_PATTERN = r"\b[^\/\s]+\/(?![0-9])[^\/\s]+(?:\/(?![0-9])[^\/\s]+)?"
+USER_ID_PATTERN = r"\s[a-z][a-z0-9]{4,19}\s"
 
 # Specific Regex Patterns
-httpd_pattern = r'\s*(\S+)\s*(\S*)\s*\-\s(\S+)\s\[(\S+\s*\S+)\]\s\"(\S+)\s+(\S+)\s(\S+)\"\s(\d+)\s(\S+)\s(\S+)\s\"(.*)\"'
-sshd_pattern = r'\[(\d.*)\]\s+(\S+)\s+(\S+)\s+(\S+)\s+((([A-Z]+)\sFROM\s+(.*))|(([A-Z]+))|((AUTH FAILURE)\sFROM\s(\S+)\s(.*))|((.+)\s+(\S+)\s+(\S+)\s+(\S+)))'
-ha_proxy_pattern = r'^(\w+ \d+ \S+) (\S+) (\S+)\[(\d+)\]: (\S+):(\d+) \[(\S+)\] (\S+) (\S+) (\S+) (\S+) (\S+) *(\S+) (\S+) (\S+)(?: (\S+) (\S+) \{([^}]*)\} \{([^}]*)\} "(\S+) ([^"]+) (\S+)")? *$'
+HTTPD_PATTERN = r"\s*(\S+)\s*(\S*)\s*\-\s(\S+)\s\[(\S+\s*\S+)\]\s\"(\S+)\s+(\S+)\s(\S+)\"\s(\d+)\s(\S+)\s(\S+)\s\"(.*)\""
+SSHD_PATTERN = r"\[(\d.*)\]\s+(\S+)\s+(\S+)\s+(\S+)\s+((([A-Z]+)\sFROM\s+(.*))|(([A-Z]+))|((AUTH FAILURE)\sFROM\s(\S+)\s(.*))|((.+)\s+(\S+)\s+(\S+)\s+(\S+)))"
+HA_PROXY_PATTERN = r'^(\w+ \d+ \S+) (\S+) (\S+)\[(\d+)\]: (\S+):(\d+) \[(\S+)\] (\S+) (\S+) (\S+) (\S+) (\S+) *(\S+) (\S+) (\S+)(?: (\S+) (\S+) \{([^}]*)\} \{([^}]*)\} "(\S+) ([^"]+) (\S+)")? *$'
+
+FILES_TO_EXCLUDE = [".gz", ".md5", ".sha1", ".sha256", ".zip"]
 
-files_to_exclude = ['.gz', '.md5', '.sha1', '.sha256', '.zip']
 
-def anonymize_ip(match):
+def anonymize_ip(matched_pattern) -> str:
     """
     Takes a regex match, representing a line in any type of log file.
     Returns an anonymized IP address.
 
     """
-    original_ip = match.group(0)
+    original_ip = matched_pattern.group(0)
     if original_ip in lookup_table:
         anonymized_ip = lookup_table[original_ip]
     else:
-        ip_parts = original_ip.split('.')
+        ip_parts = original_ip.split(".")
         anonymized_ip_parts = []
         for part in ip_parts:
             anonymized_part = str(randomize_numbers(part, True))
@@ -37,19 +48,20 @@ def anonymize_ip(match):
 
     return anonymized_ip
 
-def anonymize_ip_ha(match):
+
+def anonymize_ip_ha(matched_pattern) -> str:
     """
     Takes a regex match, representing a line in an HA Proxy log file.
     Returns a line with all of the IP addresses and their ports anonymized.
 
     """
-    original_ip = match.group(5)
-    original_port = match.group(6)
+    original_ip = matched_pattern.group(5)
+    original_port = matched_pattern.group(6)
 
     if original_ip in lookup_table:
         anonymized_ip = lookup_table[original_ip]
     else:
-        ip_parts = original_ip.split('.')
+        ip_parts = original_ip.split(".")
         anonymized_ip_parts = [str(randomize_numbers(part, True)) for part in ip_parts]
         anonymized_ip = ".".join(anonymized_ip_parts)
         lookup_table[original_ip] = anonymized_ip
@@ -60,42 +72,54 @@ def anonymize_ip_ha(match):
         anonymized_port = str(randomize_numbers(original_port, False))
         lookup_table[original_port] = anonymized_port
 
-    return ''.join([match.string[:match.start(5)], anonymized_ip, match.string[match.end(5):match.start(6)], anonymized_port, match.string[match.end(6):]])
+    return "".join(
+        [
+            matched_pattern.string[: matched_pattern.start(5)],
+            anonymized_ip,
+            matched_pattern.string[matched_pattern.end(5) : matched_pattern.start(6)],
+            anonymized_port,
+            matched_pattern.string[matched_pattern.end(6) :],
+        ]
+    )
+
 
-def anonymize_ip_line(line, filename):
+def anonymize_ip_line(line: str, filename: str) -> str:
     """
-    Takes a specific line from a file and its filename. 
+    Takes a specific line from a file and its filename.
     Returns the line with all of the IP addresses anonymized.
-    
+
     """
-    if 'ha' in filename:
-        line = re.sub(ha_proxy_pattern, anonymize_ip_ha, line)
+    if "ha" in filename:
+        line = re.sub(HA_PROXY_PATTERN, anonymize_ip_ha, line)
     else:
-        line = re.sub(ip_pattern, anonymize_ip, line)
+        line = re.sub(IP_PATTERN, anonymize_ip, line)
 
-    return line.rstrip() + '\n'
+    return line.rstrip() + "\n"
 
-def anonymize_timestamps(match):
+
+def anonymize_timestamps(matched_pattern) -> str:
     """
     Takes a regex match, representing a line in any type of log file.
     Returns an anonymized timestamp that has the same length as the original one.
+    This will break monotonicity.
 
     """
-    original_ip = match.group(0)
-    if original_ip in lookup_table:
-        anonymized_ip = lookup_table[original_ip]
+    original_timestamp = matched_pattern.group(0)
+    if original_timestamp in lookup_table:
+        anonymized_ip = lookup_table[original_timestamp]
     else:
-        ip_parts = original_ip.split(':')
+        ip_parts = original_timestamp.split(":")
         anonymized_ip_parts = []
         for part in ip_parts:
             anonymized_part = str(randomize_numbers(part))
             anonymized_ip_parts.append(anonymized_part)
         anonymized_ip = ":".join(anonymized_ip_parts)
-        lookup_table[original_ip] = anonymized_ip
+        lookup_table[original_timestamp] = anonymized_ip
 
     return anonymized_ip
 
-def anonymize_user_id(user_id):
+
+def anonymize_user_id(user_id:str) ->str:
     """
     Takes a user id or any sensitive name and anonymizes it.
 
@@ -105,129 +129,164 @@ def anonymize_user_id(user_id):
     else:
         anonymized_user_id = namesgenerator.get_random_name()
         lookup_table[user_id] = anonymized_user_id
-    
+
     return anonymized_user_id
-    
-def anonymize_user_id_general(match):
+
+
+def anonymize_user_id_general(matched_pattern:str)->str:
     """
     Takes a regex match, representing a line in any type of log file.
     Returns a randomly generated user id.
 
     """
-    original_user_id = match.group(0)
+    original_user_id = matched_pattern.group(0)
     anonymized_user_id = anonymize_user_id(original_user_id)
 
-    return ' ' + anonymized_user_id + ' ' 
+    return " " + anonymized_user_id + " "
+
 
-def anonymize_user_id_httpd_sshd(match):
+def anonymize_user_id_httpd_sshd(matched_pattern)->str:
     """
     Takes a regex match, representing a line in an HTTP or SSH log file.
     Returns a line with all of the user information anonymized.
 
     """
-    original_user_id = match.group(3)
-    if original_user_id == '-':
-        return match.group(0)
+    original_user_id = matched_pattern.group(3)
+    if original_user_id == "-":
+        return matched_pattern.group(0)
     anonymized_user_id = anonymize_user_id(original_user_id)
 
-    return ''.join([match.string[:match.start(3)], anonymized_user_id, match.string[match.end(3):]])
+    return "".join(
+        [
+            matched_pattern.string[: matched_pattern.start(3)],
+            anonymized_user_id,
+            matched_pattern.string[matched_pattern.end(3) :],
+        ]
+    )
 
-def anonymize_sensitive_info_ha(match):
+
+def anonymize_sensitive_info_ha(matched_pattern)->str:
     """
     Takes a regex match, representing a line in an HA Proxy log file.
     Returns a line with all of the sensitive information anonymized.
 
     """
     # Validate that sensitive information is present
-    if match.group(18) == None or match.group(19) == None:
-        return match.group(0)
+    if not (matched_pattern.group(18) and matched_pattern.group(19)):
+        return matched_pattern.group(0)
 
-    original_info1 = match.group(18)
-    original_info2 = match.group(19)
+    original_info1 = matched_pattern.group(18)
+    original_info2 = matched_pattern.group(19)
     anonymized_info1 = anonymize_user_id(original_info1)
     anonymized_info2 = anonymize_user_id(original_info2)
 
-    return ''.join([match.string[:match.start(18)], anonymized_info1, match.string[match.end(18):match.start(19)], anonymized_info2, match.string[match.end(19):]])
+    return "".join(
+        [
+            matched_pattern.string[: matched_pattern.start(18)],
+            anonymized_info1,
+            matched_pattern.string[matched_pattern.end(18) : matched_pattern.start(19)],
+            anonymized_info2,
+            matched_pattern.string[matched_pattern.end(19) :],
+        ]
+    )
 
-def anonymize_user_line(line, filename):
+
+def anonymize_user_line(line:str, filename:str)->str:
     """
-    Takes a specific line from a file and its filename. 
+    Takes a specific line from a file and its filename.
     Returns the line with all of the user information and sensitive information anonymized.
-    
-    """
-    if 'httpd' in filename:
-        line = re.sub(httpd_pattern, anonymize_user_id_httpd_sshd, line)
-    elif 'sshd' in filename:
-        line = re.sub(sshd_pattern, anonymize_user_id_httpd_sshd, line)
-    elif 'ha' in filename:
-        line = re.sub(ha_proxy_pattern, anonymize_sensitive_info_ha, line)
+
+    """
+    if "httpd" in filename:
+        line = re.sub(HTTPD_PATTERN, anonymize_user_id_httpd_sshd, line)
+    elif "sshd" in filename:
+        line = re.sub(SSHD_PATTERN, anonymize_user_id_httpd_sshd, line)
+    elif "ha" in filename:
+        line = re.sub(HA_PROXY_PATTERN, anonymize_sensitive_info_ha, line)
     else:
-        line = re.sub(user_id_pattern, anonymize_user_id_general, line)
+        line = re.sub(USER_ID_PATTERN, anonymize_user_id_general, line)
 
-    return line.rstrip() + '\n'
+    return line.rstrip() + "\n"
 
-def anonymize_endpoint(original_endpoint):
+
+def anonymize_endpoint(original_endpoint)->str:
     """
     Takes an endpoint and returns its anonymized version.
 
     """
-    endpoint_parts = original_endpoint.strip('/').split('/')
+    endpoint_parts = original_endpoint.strip("/").split("/")
     anonymized_parts = []
     for part in endpoint_parts:
         if part in lookup_table:
             anonymized_part = lookup_table[part]
-        else: 
+        else:
             anonymized_part = namesgenerator.get_random_name()
             lookup_table[part] = anonymized_part
         anonymized_parts.append(anonymized_part)
-    
-    if original_endpoint.startswith('/'):
-        anonymized_endpoint = '/' + '/'.join(anonymized_parts)
+
+    if original_endpoint.startswith("/"):
+        anonymized_endpoint = "/" + "/".join(anonymized_parts)
     else:
-        anonymized_endpoint = '/'.join(anonymized_parts)
+        anonymized_endpoint = "/".join(anonymized_parts)
 
     return anonymized_endpoint
 
-def anonymize_endpoint_general(match):
+
+def anonymize_endpoint_general(match)->str:
     """
     Takes a regex match, representing a line in any type of log file.
     Returns the anonymized endpoint.
-    
+
     """
     original_endpoint = match.group(0)
     return anonymize_endpoint(original_endpoint)
 
-def anonymize_endpoint_httpd(match):
+
+def anonymize_endpoint_httpd(match)->str:
     """
     Takes a regex match, representing a line in an HTTP log file.
     Returns the anonymized endpoint.
-    
+
     """
     original_endpoint = match.group(6)
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
 
-    return ''.join([match.string[:match.start(6)], anonymized_endpoint, match.string[match.end(6):]])
+    return "".join(
+        [
+            match.string[: match.start(6)],
+            anonymized_endpoint,
+            match.string[match.end(6) :],
+        ]
+    )
+
 
-def anonymize_endpoint_sshd(match):
+def anonymize_endpoint_sshd(match)->str:
     """
     Takes a regex match, representing a line in an SSH log file.
     Returns the anonymized endpoint.
-    
+
     """
     original_endpoint = match.group(5)
-    parts = original_endpoint.split(' ', 1)
-    endpoint_part = parts[0].strip('/').split('/')
-    remaining_string = parts[1] if len(parts) > 1 else ''
+    parts = original_endpoint.split(" ", 1)
+    endpoint_part = parts[0].strip("/").split("/")
+    remaining_string = parts[1] if len(parts) > 1 else ""
 
     if len(endpoint_part) < 2:
         return match.group(0)
 
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
-    anonymized_endpoint += ' ' + remaining_string if remaining_string else ''
+    anonymized_endpoint += " " + remaining_string if remaining_string else ""
+
+    return "".join(
+        [
+            match.string[: match.start(5)],
+            anonymized_endpoint,
+            match.string[match.end(5) :],
+        ]
+    )
 
-    return ''.join([match.string[:match.start(5)], anonymized_endpoint, match.string[match.end(5):]])
 
-def anonymize_endpoint_ha(match):
+def anonymize_endpoint_ha(match)->str:
     """
     Takes a regex match, representing a line in an HA Proxy log file.
     Returns a line with all of the endpoints anonymized.
@@ -235,32 +294,48 @@ def anonymize_endpoint_ha(match):
     """
     original_endpoint = match.group(9)
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
-    anonymized_line =  ''.join([match.string[:match.start(9)], anonymized_endpoint, match.string[match.end(9):]])
-
-    if match.group(21) != None:
+    anonymized_line = "".join(
+        [
+            match.string[: match.start(9)],
+            anonymized_endpoint,
+            match.string[match.end(9) :],
+        ]
+    )
+
+    if match.group(21):
         original_endpoint2 = match.group(21)
         anonymized_endpoint2 = anonymize_endpoint(original_endpoint2)
-        anonymized_line = ''.join([match.string[:match.start(9)], anonymized_endpoint, match.string[match.end(9):match.start(21)], anonymized_endpoint2, match.string[match.end(21):]])
+        anonymized_line = "".join(
+            [
+                match.string[: match.start(9)],
+                anonymized_endpoint,
+                match.string[match.end(9) : match.start(21)],
+                anonymized_endpoint2,
+                match.string[match.end(21) :],
+            ]
+        )
 
     return anonymized_line
 
-def anonymize_endpoint_line(line, filename):
+
+def anonymize_endpoint_line(line, filename)->str:
     """
-    Takes a specific line from a file and its filename. 
+    Takes a specific line from a file and its filename.
     Returns the line with all of the endpoints anonymized.
-    
-    """
-    if 'httpd' in filename:
-        line = re.sub(httpd_pattern, anonymize_endpoint_httpd, line)
-    elif 'sshd' in filename:
-        line = re.sub(sshd_pattern, anonymize_endpoint_sshd, line)
-    elif 'ha' in filename:
-        line = re.sub(ha_proxy_pattern, anonymize_endpoint_ha, line)
+
+    """
+    if "httpd" in filename:
+        line = re.sub(HTTPD_PATTERN, anonymize_endpoint_httpd, line)
+    elif "sshd" in filename:
+        line = re.sub(SSHD_PATTERN, anonymize_endpoint_sshd, line)
+    elif "ha" in filename:
+        line = re.sub(HA_PROXY_PATTERN, anonymize_endpoint_ha, line)
     else:
-        line = re.sub(endpoint_pattern, anonymize_endpoint_general, line)
-    return line.rstrip() + '\n'
+        line = re.sub(ENDPOINT_PATTERN, anonymize_endpoint_general, line)
+    return line.rstrip() + "\n"
+
 
-def randomize_numbers(number, is_ip_address = False):
+def randomize_numbers(number, is_ip_address:bool=False)->str:
     """
     Takes a number and a flag to indicate if the given number is part of an IP address.
     Returns a random number that has the same length as the original one. If the number is part of an IP address,
@@ -268,8 +343,8 @@ def randomize_numbers(number, is_ip_address = False):
 
     """
     num_len = len(str(number))
-    lower_bound = 10**(num_len - 1)
-    if (is_ip_address):
+    lower_bound = 10 ** (num_len - 1)
+    if is_ip_address:
         upper_bound = min(255, (10**num_len - 1))
     else:
         upper_bound = 10**num_len - 1
@@ -280,47 +355,71 @@ def randomize_numbers(number, is_ip_address = False):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Anonymize log files. Anonymized files and their lookup tables are storedd in the `anonymized-logs` folder")
-    parser.add_argument('input_directory', type=str, help='Directory containing the log files to be anonymized')
-    parser.add_argument('output_directory', type=str, help='Directory where the anonymized files will be stored')
-    parser.add_argument('--ip', action='store_true', help='Only Anonymize IP Addresses')
-    parser.add_argument('--endpoint', action='store_true', help='Only Anonymize Endpoints')
-    parser.add_argument('--user', action='store_true', help='Only Anonymize User IDs')
-    parser.add_argument('--timestamps', action='store_true', help='Only Anonymize Timestamps')
+    parser = argparse.ArgumentParser(
+        description="Anonymize log files. Anonymized files and their lookup tables are storedd in the `anonymized-logs` folder"
+    )
+    parser.add_argument(
+        "input_directory",
+        type=str,
+        help="Directory containing the log files to be anonymized",
+    )
+    parser.add_argument(
+        "output_directory",
+        type=str,
+        help="Directory where the anonymized files will be stored",
+    )
+    parser.add_argument("--ip", action="store_true", help="Only Anonymize IP Addresses")
+    parser.add_argument(
+        "--endpoint", action="store_true", help="Only Anonymize Endpoints"
+    )
+    parser.add_argument("--user", action="store_true", help="Only Anonymize User IDs")
+    parser.add_argument(
+        "--timestamps", action="store_true", help="Only Anonymize Timestamps"
+    )
 
     args = parser.parse_args()
 
-    base_directory = args.input_directory 
+    base_directory = args.input_directory
     output_directory = args.output_directory
     os.makedirs(output_directory, exist_ok=True)
 
-    for filename in os.listdir(base_directory):
-        if not filename.endswith(tuple(files_to_exclude)):
-            input_file_path = os.path.join(base_directory, filename)
-            output_file_path = os.path.join(output_directory, f'anonymized_{filename}')
-            lookup_file_path = os.path.join(output_directory, f'lookup_table_{filename}.txt')
+    for file_name in os.listdir(base_directory):
+        if not file_name.lower().endswith(tuple(FILES_TO_EXCLUDE)):
+            input_file_path = os.path.join(base_directory, file_name)
+            output_file_path = os.path.join(output_directory, f"anonymized_{file_name}")
+            lookup_file_path = os.path.join(
+                output_directory, f"lookup_table_{file_name}.txt"
+            )
             lookup_table = {}
 
-            with open(input_file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
-                for line in input_file:
+            with open(
+                file=input_file_path, mode="r", encoding="utf-8"
+            ) as input_file, open(
+                file=output_file_path, mode="w", encoding="utf-8"
+            ) as output_file:
+                for current_line in input_file:
                     if args.ip:
-                        line = anonymize_ip_line(line, filename)
+                        current_line = anonymize_ip_line(current_line, file_name)
                     if args.endpoint:
-                        line = anonymize_endpoint_line(line, filename)
+                        current_line = anonymize_endpoint_line(current_line, file_name)
                     if args.user:
-                        line = anonymize_user_line(line, filename)
+                        current_line = anonymize_user_line(current_line, file_name)
                     if args.timestamps:
-                        line = re.sub(timestamps, anonymize_timestamps, line)
+                        re_match = re.match(TIMESTAMP_PATTERN, current_line)
+                        if re_match:
+                            new_ts = anonymize_timestamps(re_match)
+                            if new_ts:
+                                current_line = re.sub(TIMESTAMP_PATTERN, new_ts, current_line)
                     if not (args.ip or args.timestamps or args.endpoint or args.user):
-                        line = anonymize_ip_line(line, filename)
-                        line = anonymize_endpoint_line(line, filename)
-                        line = anonymize_user_line(line, filename)
-                    
-                    output_file.write(line)
-                    
-            with open(lookup_file_path, 'w') as lookup_file:
+                        current_line = anonymize_ip_line(current_line, file_name)
+                        current_line = anonymize_endpoint_line(current_line, file_name)
+                        current_line = anonymize_user_line(current_line, file_name)
+
+                    output_file.write(current_line)
+
+            with open(file=lookup_file_path, mode="w", encoding="utf-8") as lookup_file:
                 for original_data, anonymized_data in lookup_table.items():
-                    lookup_file.write(f'{anonymized_data} -> {original_data}\n')
+                    lookup_file.write(f"{anonymized_data} -> {original_data}\n")
 
-            print(f'Logs in {filename} anonymized and saved to {output_file_path}')
-            print(f'Lookup table for {filename} saved to {lookup_file_path} \n')
\ No newline at end of file
+            print(f"Logs in {file_name} anonymized and saved to {output_file_path}")
+            print(f"Lookup table for {file_name} saved to {lookup_file_path} \n")

From bd3aaa1cae8b59eabb9a2b5fd925cbb1197d3381 Mon Sep 17 00:00:00 2001
From: MatthewKhouzam <matthew.khouzam@ericsson.com>
Date: Tue, 30 Jan 2024 13:52:03 +0000
Subject: [PATCH 4/5] Add progress bars

Signed-off-by: MatthewKhouzam <matthew.khouzam@ericsson.com>
---
 log_anonymizer.py | 201 +++++++++++++++++++++++++++++++++++++++++++---
 requirements.txt  |   1 +
 2 files changed, 192 insertions(+), 10 deletions(-)

diff --git a/log_anonymizer.py b/log_anonymizer.py
index a35abd2..b872b85 100644
--- a/log_anonymizer.py
+++ b/log_anonymizer.py
@@ -1,3 +1,5 @@
+#! /usr/bin/env python3
+
 """
 Log Anonymizer
 
@@ -7,7 +9,9 @@
 import random
 import re
 import argparse
+import argparse
 import namesgenerator
+import tqdm
 
 __author__ = "Vlad Arama"
 __copyright__ = "Copyright 2024, Ericsson"
@@ -19,11 +23,20 @@
 TIMESTAMP_PATTERN = r"(\d{4}:\d{2}:\d{2}:\d{2})|(\d{2}:\d{2}:\d{2}[,\.]\d{3})"
 ENDPOINT_PATTERN = r"\b[^\/\s]+\/(?![0-9])[^\/\s]+(?:\/(?![0-9])[^\/\s]+)?"
 USER_ID_PATTERN = r"\s[a-z][a-z0-9]{4,19}\s"
+IP_PATTERN = r"\b\d{1,3}(?:\.\d{1,3}){2,}\b"
+TIMESTAMP_PATTERN = r"(\d{4}:\d{2}:\d{2}:\d{2})|(\d{2}:\d{2}:\d{2}[,\.]\d{3})"
+ENDPOINT_PATTERN = r"\b[^\/\s]+\/(?![0-9])[^\/\s]+(?:\/(?![0-9])[^\/\s]+)?"
+USER_ID_PATTERN = r"\s[a-z][a-z0-9]{4,19}\s"
 
 # Specific Regex Patterns
 HTTPD_PATTERN = r"\s*(\S+)\s*(\S*)\s*\-\s(\S+)\s\[(\S+\s*\S+)\]\s\"(\S+)\s+(\S+)\s(\S+)\"\s(\d+)\s(\S+)\s(\S+)\s\"(.*)\""
 SSHD_PATTERN = r"\[(\d.*)\]\s+(\S+)\s+(\S+)\s+(\S+)\s+((([A-Z]+)\sFROM\s+(.*))|(([A-Z]+))|((AUTH FAILURE)\sFROM\s(\S+)\s(.*))|((.+)\s+(\S+)\s+(\S+)\s+(\S+)))"
 HA_PROXY_PATTERN = r'^(\w+ \d+ \S+) (\S+) (\S+)\[(\d+)\]: (\S+):(\d+) \[(\S+)\] (\S+) (\S+) (\S+) (\S+) (\S+) *(\S+) (\S+) (\S+)(?: (\S+) (\S+) \{([^}]*)\} \{([^}]*)\} "(\S+) ([^"]+) (\S+)")? *$'
+HTTPD_PATTERN = r"\s*(\S+)\s*(\S*)\s*\-\s(\S+)\s\[(\S+\s*\S+)\]\s\"(\S+)\s+(\S+)\s(\S+)\"\s(\d+)\s(\S+)\s(\S+)\s\"(.*)\""
+SSHD_PATTERN = r"\[(\d.*)\]\s+(\S+)\s+(\S+)\s+(\S+)\s+((([A-Z]+)\sFROM\s+(.*))|(([A-Z]+))|((AUTH FAILURE)\sFROM\s(\S+)\s(.*))|((.+)\s+(\S+)\s+(\S+)\s+(\S+)))"
+HA_PROXY_PATTERN = r'^(\w+ \d+ \S+) (\S+) (\S+)\[(\d+)\]: (\S+):(\d+) \[(\S+)\] (\S+) (\S+) (\S+) (\S+) (\S+) *(\S+) (\S+) (\S+)(?: (\S+) (\S+) \{([^}]*)\} \{([^}]*)\} "(\S+) ([^"]+) (\S+)")? *$'
+
+FILES_TO_EXCLUDE = [".gz", ".md5", ".sha1", ".sha256", ".zip"]
 
 FILES_TO_EXCLUDE = [".gz", ".md5", ".sha1", ".sha256", ".zip"]
 
@@ -34,10 +47,11 @@ def anonymize_ip(matched_pattern) -> str:
     Returns an anonymized IP address.
 
     """
-    original_ip = matched_pattern.group(0)
+    original_ip = matched_patterned_pattern.group(0)
     if original_ip in lookup_table:
         anonymized_ip = lookup_table[original_ip]
     else:
+        ip_parts = original_ip.split(".")
         ip_parts = original_ip.split(".")
         anonymized_ip_parts = []
         for part in ip_parts:
@@ -55,12 +69,13 @@ def anonymize_ip_ha(matched_pattern) -> str:
     Returns a line with all of the IP addresses and their ports anonymized.
 
     """
-    original_ip = matched_pattern.group(5)
-    original_port = matched_pattern.group(6)
+    original_ip = matched_patterned_pattern.group(5)
+    original_port = matched_patterned_pattern.group(6)
 
     if original_ip in lookup_table:
         anonymized_ip = lookup_table[original_ip]
     else:
+        ip_parts = original_ip.split(".")
         ip_parts = original_ip.split(".")
         anonymized_ip_parts = [str(randomize_numbers(part, True)) for part in ip_parts]
         anonymized_ip = ".".join(anonymized_ip_parts)
@@ -82,19 +97,36 @@ def anonymize_ip_ha(matched_pattern) -> str:
         ]
     )
 
+    return "".join(
+        [
+            matched_pattern.string[: matched_pattern.start(5)],
+            anonymized_ip,
+            matched_pattern.string[matched_pattern.end(5) : matched_pattern.start(6)],
+            anonymized_port,
+            matched_pattern.string[matched_pattern.end(6) :],
+        ]
+    )
+
 
 def anonymize_ip_line(line: str, filename: str) -> str:
     """
     Takes a specific line from a file and its filename.
+    Takes a specific line from a file and its filename.
     Returns the line with all of the IP addresses anonymized.
 
+
     """
+    if "ha" in filename:
+        line = re.sub(HA_PROXY_PATTERN, anonymize_ip_ha, line)
     if "ha" in filename:
         line = re.sub(HA_PROXY_PATTERN, anonymize_ip_ha, line)
     else:
         line = re.sub(IP_PATTERN, anonymize_ip, line)
 
     return line.rstrip() + "\n"
+        line = re.sub(IP_PATTERN, anonymize_ip, line)
+
+    return line.rstrip() + "\n"
 
 
 def anonymize_timestamps(matched_pattern) -> str:
@@ -102,12 +134,17 @@ def anonymize_timestamps(matched_pattern) -> str:
     Takes a regex match, representing a line in any type of log file.
     Returns an anonymized timestamp that has the same length as the original one.
     This will break monotonicity.
+    This will break monotonicity.
 
     """
     original_timestamp = matched_pattern.group(0)
+    if original_timestamp in lookup_table:
+        anonymized_ip = lookup_table[original_timestamp]
+    original_timestamp = matched_pattern.group(0)
     if original_timestamp in lookup_table:
         anonymized_ip = lookup_table[original_timestamp]
     else:
+        ip_parts = original_timestamp.split(":")
         ip_parts = original_timestamp.split(":")
         anonymized_ip_parts = []
         for part in ip_parts:
@@ -115,6 +152,7 @@ def anonymize_timestamps(matched_pattern) -> str:
             anonymized_ip_parts.append(anonymized_part)
         anonymized_ip = ":".join(anonymized_ip_parts)
         lookup_table[original_timestamp] = anonymized_ip
+        lookup_table[original_timestamp] = anonymized_ip
 
     return anonymized_ip
 
@@ -130,6 +168,7 @@ def anonymize_user_id(user_id:str) ->str:
         anonymized_user_id = namesgenerator.get_random_name()
         lookup_table[user_id] = anonymized_user_id
 
+
     return anonymized_user_id
 
 
@@ -139,11 +178,13 @@ def anonymize_user_id_general(matched_pattern:str)->str:
     Returns a randomly generated user id.
 
     """
-    original_user_id = matched_pattern.group(0)
+    original_user_id = matched_patterned_pattern.group(0)
     anonymized_user_id = anonymize_user_id(original_user_id)
 
     return " " + anonymized_user_id + " "
 
+    return " " + anonymized_user_id + " "
+
 
 def anonymize_user_id_httpd_sshd(matched_pattern)->str:
     """
@@ -152,6 +193,9 @@ def anonymize_user_id_httpd_sshd(matched_pattern)->str:
 
     """
     original_user_id = matched_pattern.group(3)
+    if original_user_id == "-":
+        return matched_pattern.group(0)
+    original_user_id = matched_pattern.group(3)
     if original_user_id == "-":
         return matched_pattern.group(0)
     anonymized_user_id = anonymize_user_id(original_user_id)
@@ -164,6 +208,14 @@ def anonymize_user_id_httpd_sshd(matched_pattern)->str:
         ]
     )
 
+    return "".join(
+        [
+            matched_pattern.string[: matched_pattern.start(3)],
+            anonymized_user_id,
+            matched_pattern.string[matched_pattern.end(3) :],
+        ]
+    )
+
 
 def anonymize_sensitive_info_ha(matched_pattern)->str:
     """
@@ -174,7 +226,11 @@ def anonymize_sensitive_info_ha(matched_pattern)->str:
     # Validate that sensitive information is present
     if not (matched_pattern.group(18) and matched_pattern.group(19)):
         return matched_pattern.group(0)
+    if not (matched_pattern.group(18) and matched_pattern.group(19)):
+        return matched_pattern.group(0)
 
+    original_info1 = matched_pattern.group(18)
+    original_info2 = matched_pattern.group(19)
     original_info1 = matched_pattern.group(18)
     original_info2 = matched_pattern.group(19)
     anonymized_info1 = anonymize_user_id(original_info1)
@@ -190,13 +246,31 @@ def anonymize_sensitive_info_ha(matched_pattern)->str:
         ]
     )
 
+    return "".join(
+        [
+            matched_pattern.string[: matched_pattern.start(18)],
+            anonymized_info1,
+            matched_pattern.string[matched_pattern.end(18) : matched_pattern.start(19)],
+            anonymized_info2,
+            matched_pattern.string[matched_pattern.end(19) :],
+        ]
+    )
+
 
 def anonymize_user_line(line:str, filename:str)->str:
     """
     Takes a specific line from a file and its filename.
+    Takes a specific line from a file and its filename.
     Returns the line with all of the user information and sensitive information anonymized.
 
+
     """
+    if "httpd" in filename:
+        line = re.sub(HTTPD_PATTERN, anonymize_user_id_httpd_sshd, line)
+    elif "sshd" in filename:
+        line = re.sub(SSHD_PATTERN, anonymize_user_id_httpd_sshd, line)
+    elif "ha" in filename:
+        line = re.sub(HA_PROXY_PATTERN, anonymize_sensitive_info_ha, line)
     if "httpd" in filename:
         line = re.sub(HTTPD_PATTERN, anonymize_user_id_httpd_sshd, line)
     elif "sshd" in filename:
@@ -207,6 +281,9 @@ def anonymize_user_line(line:str, filename:str)->str:
         line = re.sub(USER_ID_PATTERN, anonymize_user_id_general, line)
 
     return line.rstrip() + "\n"
+        line = re.sub(USER_ID_PATTERN, anonymize_user_id_general, line)
+
+    return line.rstrip() + "\n"
 
 
 def anonymize_endpoint(original_endpoint)->str:
@@ -215,19 +292,25 @@ def anonymize_endpoint(original_endpoint)->str:
 
     """
     endpoint_parts = original_endpoint.strip("/").split("/")
+    endpoint_parts = original_endpoint.strip("/").split("/")
     anonymized_parts = []
     for part in endpoint_parts:
         if part in lookup_table:
             anonymized_part = lookup_table[part]
+        else:
         else:
             anonymized_part = namesgenerator.get_random_name()
             lookup_table[part] = anonymized_part
         anonymized_parts.append(anonymized_part)
 
+    if original_endpoint.startswith("/"):
+        anonymized_endpoint = "/" + "/".join(anonymized_parts)
+
     if original_endpoint.startswith("/"):
         anonymized_endpoint = "/" + "/".join(anonymized_parts)
     else:
         anonymized_endpoint = "/".join(anonymized_parts)
+        anonymized_endpoint = "/".join(anonymized_parts)
 
     return anonymized_endpoint
 
@@ -236,7 +319,6 @@ def anonymize_endpoint_general(match)->str:
     """
     Takes a regex match, representing a line in any type of log file.
     Returns the anonymized endpoint.
-
     """
     original_endpoint = match.group(0)
     return anonymize_endpoint(original_endpoint)
@@ -247,6 +329,7 @@ def anonymize_endpoint_httpd(match)->str:
     Takes a regex match, representing a line in an HTTP log file.
     Returns the anonymized endpoint.
 
+
     """
     original_endpoint = match.group(6)
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
@@ -259,23 +342,42 @@ def anonymize_endpoint_httpd(match)->str:
         ]
     )
 
+    return "".join(
+        [
+            match.string[: match.start(6)],
+            anonymized_endpoint,
+            match.string[match.end(6) :],
+        ]
+    )
+
 
 def anonymize_endpoint_sshd(match)->str:
     """
     Takes a regex match, representing a line in an SSH log file.
     Returns the anonymized endpoint.
-
     """
     original_endpoint = match.group(5)
     parts = original_endpoint.split(" ", 1)
     endpoint_part = parts[0].strip("/").split("/")
     remaining_string = parts[1] if len(parts) > 1 else ""
+    parts = original_endpoint.split(" ", 1)
+    endpoint_part = parts[0].strip("/").split("/")
+    remaining_string = parts[1] if len(parts) > 1 else ""
 
     if len(endpoint_part) < 2:
         return match.group(0)
 
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
     anonymized_endpoint += " " + remaining_string if remaining_string else ""
+    anonymized_endpoint += " " + remaining_string if remaining_string else ""
+
+    return "".join(
+        [
+            match.string[: match.start(5)],
+            anonymized_endpoint,
+            match.string[match.end(5) :],
+        ]
+    )
 
     return "".join(
         [
@@ -290,7 +392,6 @@ def anonymize_endpoint_ha(match)->str:
     """
     Takes a regex match, representing a line in an HA Proxy log file.
     Returns a line with all of the endpoints anonymized.
-
     """
     original_endpoint = match.group(9)
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
@@ -301,7 +402,15 @@ def anonymize_endpoint_ha(match)->str:
             match.string[match.end(9) :],
         ]
     )
+    anonymized_line = "".join(
+        [
+            match.string[: match.start(9)],
+            anonymized_endpoint,
+            match.string[match.end(9) :],
+        ]
+    )
 
+    if match.group(21):
     if match.group(21):
         original_endpoint2 = match.group(21)
         anonymized_endpoint2 = anonymize_endpoint(original_endpoint2)
@@ -314,6 +423,15 @@ def anonymize_endpoint_ha(match)->str:
                 match.string[match.end(21) :],
             ]
         )
+        anonymized_line = "".join(
+            [
+                match.string[: match.start(9)],
+                anonymized_endpoint,
+                match.string[match.end(9) : match.start(21)],
+                anonymized_endpoint2,
+                match.string[match.end(21) :],
+            ]
+        )
 
     return anonymized_line
 
@@ -323,7 +441,14 @@ def anonymize_endpoint_line(line, filename)->str:
     Takes a specific line from a file and its filename.
     Returns the line with all of the endpoints anonymized.
 
+
     """
+    if "httpd" in filename:
+        line = re.sub(HTTPD_PATTERN, anonymize_endpoint_httpd, line)
+    elif "sshd" in filename:
+        line = re.sub(SSHD_PATTERN, anonymize_endpoint_sshd, line)
+    elif "ha" in filename:
+        line = re.sub(HA_PROXY_PATTERN, anonymize_endpoint_ha, line)
     if "httpd" in filename:
         line = re.sub(HTTPD_PATTERN, anonymize_endpoint_httpd, line)
     elif "sshd" in filename:
@@ -334,7 +459,11 @@ def anonymize_endpoint_line(line, filename)->str:
         line = re.sub(ENDPOINT_PATTERN, anonymize_endpoint_general, line)
     return line.rstrip() + "\n"
 
+        line = re.sub(ENDPOINT_PATTERN, anonymize_endpoint_general, line)
+    return line.rstrip() + "\n"
+
 
+def randomize_numbers(number, is_ip_address:bool=False)->str:
 def randomize_numbers(number, is_ip_address:bool=False)->str:
     """
     Takes a number and a flag to indicate if the given number is part of an IP address.
@@ -344,6 +473,8 @@ def randomize_numbers(number, is_ip_address:bool=False)->str:
     """
     num_len = len(str(number))
     lower_bound = 10 ** (num_len - 1)
+    if is_ip_address:
+    lower_bound = 10 ** (num_len - 1)
     if is_ip_address:
         upper_bound = min(255, (10**num_len - 1))
     else:
@@ -354,6 +485,15 @@ def randomize_numbers(number, is_ip_address:bool=False)->str:
     return random.randint(lower_bound, upper_bound)
 
 
+def count_lines(file_path:str)->int:
+    c = 0
+    with open(file_path) as file:
+        while True:
+            chunk = file.read(10 ** 7)
+            if chunk == "":
+                return c
+            c += chunk.count("\n")
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Anonymize log files. Anonymized files and their lookup tables are storedd in the `anonymized-logs` folder"
@@ -376,14 +516,36 @@ def randomize_numbers(number, is_ip_address:bool=False)->str:
     parser.add_argument(
         "--timestamps", action="store_true", help="Only Anonymize Timestamps"
     )
+    parser = argparse.ArgumentParser(
+        description="Anonymize log files. Anonymized files and their lookup tables are storedd in the `anonymized-logs` folder"
+    )
+    parser.add_argument(
+        "input_directory",
+        type=str,
+        help="Directory containing the log files to be anonymized",
+    )
+    parser.add_argument(
+        "output_directory",
+        type=str,
+        help="Directory where the anonymized files will be stored",
+    )
+    parser.add_argument("--ip", action="store_true", help="Only Anonymize IP Addresses")
+    parser.add_argument(
+        "--endpoint", action="store_true", help="Only Anonymize Endpoints"
+    )
+    parser.add_argument("--user", action="store_true", help="Only Anonymize User IDs")
+    parser.add_argument(
+        "--timestamps", action="store_true", help="Only Anonymize Timestamps"
+    )
 
     args = parser.parse_args()
 
+    base_directory = args.input_directory
     base_directory = args.input_directory
     output_directory = args.output_directory
     os.makedirs(output_directory, exist_ok=True)
 
-    for file_name in os.listdir(base_directory):
+    for file_name in tqdm.tqdm(os.listdir(base_directory), unit=' Files'):
         if not file_name.lower().endswith(tuple(FILES_TO_EXCLUDE)):
             input_file_path = os.path.join(base_directory, file_name)
             output_file_path = os.path.join(output_directory, f"anonymized_{file_name}")
@@ -391,25 +553,33 @@ def randomize_numbers(number, is_ip_address:bool=False)->str:
                 output_directory, f"lookup_table_{file_name}.txt"
             )
             lookup_table = {}
-
+            line_count = count_lines(file_path=file_name)
             with open(
                 file=input_file_path, mode="r", encoding="utf-8"
             ) as input_file, open(
                 file=output_file_path, mode="w", encoding="utf-8"
             ) as output_file:
-                for current_line in input_file:
+                for current_line in tqdm.tqdm(iterable=input_file,unit=' Lines', total=line_count):
                     if args.ip:
                         current_line = anonymize_ip_line(current_line, file_name)
+                        current_line = anonymize_ip_line(current_line, file_name)
                     if args.endpoint:
                         current_line = anonymize_endpoint_line(current_line, file_name)
+                        current_line = anonymize_endpoint_line(current_line, file_name)
                     if args.user:
                         current_line = anonymize_user_line(current_line, file_name)
+                        current_line = anonymize_user_line(current_line, file_name)
                     if args.timestamps:
                         re_match = re.match(TIMESTAMP_PATTERN, current_line)
                         if re_match:
                             new_ts = anonymize_timestamps(re_match)
                             if new_ts:
                                 current_line = re.sub(TIMESTAMP_PATTERN, new_ts, current_line)
+                        re_match = re.match(TIMESTAMP_PATTERN, current_line)
+                        if re_match:
+                            new_ts = anonymize_timestamps(re_match)
+                            if new_ts:
+                                current_line = re.sub(TIMESTAMP_PATTERN, new_ts, current_line)
                     if not (args.ip or args.timestamps or args.endpoint or args.user):
                         current_line = anonymize_ip_line(current_line, file_name)
                         current_line = anonymize_endpoint_line(current_line, file_name)
@@ -417,9 +587,20 @@ def randomize_numbers(number, is_ip_address:bool=False)->str:
 
                     output_file.write(current_line)
 
+            with open(file=lookup_file_path, mode="w", encoding="utf-8") as lookup_file:
+                        current_line = anonymize_ip_line(current_line, file_name)
+                        current_line = anonymize_endpoint_line(current_line, file_name)
+                        current_line = anonymize_user_line(current_line, file_name)
+
+                    output_file.write(current_line)
+
             with open(file=lookup_file_path, mode="w", encoding="utf-8") as lookup_file:
                 for original_data, anonymized_data in lookup_table.items():
                     lookup_file.write(f"{anonymized_data} -> {original_data}\n")
 
             print(f"Logs in {file_name} anonymized and saved to {output_file_path}")
             print(f"Lookup table for {file_name} saved to {lookup_file_path} \n")
+                    lookup_file.write(f"{anonymized_data} -> {original_data}\n")
+
+            print(f"Logs in {file_name} anonymized and saved to {output_file_path}")
+            print(f"Lookup table for {file_name} saved to {lookup_file_path} \n")
diff --git a/requirements.txt b/requirements.txt
index d3386a9..ae6dc97 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
 namesgenerator==0.3
+tqdm==4.64.0

From e94806eaa5be7129545cd4bbaad05dd3e8a982ae Mon Sep 17 00:00:00 2001
From: Matthew Khouzam <matthew.khouzam@ericsson.com>
Date: Wed, 31 Jan 2024 18:05:37 -0500
Subject: [PATCH 5/5] Update log_anonymizer.py

Update count_lines
---
 log_anonymizer.py | 211 +++++-----------------------------------------
 1 file changed, 21 insertions(+), 190 deletions(-)

diff --git a/log_anonymizer.py b/log_anonymizer.py
index b872b85..927877f 100644
--- a/log_anonymizer.py
+++ b/log_anonymizer.py
@@ -9,7 +9,6 @@
 import random
 import re
 import argparse
-import argparse
 import namesgenerator
 import tqdm
 
@@ -23,20 +22,11 @@
 TIMESTAMP_PATTERN = r"(\d{4}:\d{2}:\d{2}:\d{2})|(\d{2}:\d{2}:\d{2}[,\.]\d{3})"
 ENDPOINT_PATTERN = r"\b[^\/\s]+\/(?![0-9])[^\/\s]+(?:\/(?![0-9])[^\/\s]+)?"
 USER_ID_PATTERN = r"\s[a-z][a-z0-9]{4,19}\s"
-IP_PATTERN = r"\b\d{1,3}(?:\.\d{1,3}){2,}\b"
-TIMESTAMP_PATTERN = r"(\d{4}:\d{2}:\d{2}:\d{2})|(\d{2}:\d{2}:\d{2}[,\.]\d{3})"
-ENDPOINT_PATTERN = r"\b[^\/\s]+\/(?![0-9])[^\/\s]+(?:\/(?![0-9])[^\/\s]+)?"
-USER_ID_PATTERN = r"\s[a-z][a-z0-9]{4,19}\s"
 
 # Specific Regex Patterns
 HTTPD_PATTERN = r"\s*(\S+)\s*(\S*)\s*\-\s(\S+)\s\[(\S+\s*\S+)\]\s\"(\S+)\s+(\S+)\s(\S+)\"\s(\d+)\s(\S+)\s(\S+)\s\"(.*)\""
 SSHD_PATTERN = r"\[(\d.*)\]\s+(\S+)\s+(\S+)\s+(\S+)\s+((([A-Z]+)\sFROM\s+(.*))|(([A-Z]+))|((AUTH FAILURE)\sFROM\s(\S+)\s(.*))|((.+)\s+(\S+)\s+(\S+)\s+(\S+)))"
 HA_PROXY_PATTERN = r'^(\w+ \d+ \S+) (\S+) (\S+)\[(\d+)\]: (\S+):(\d+) \[(\S+)\] (\S+) (\S+) (\S+) (\S+) (\S+) *(\S+) (\S+) (\S+)(?: (\S+) (\S+) \{([^}]*)\} \{([^}]*)\} "(\S+) ([^"]+) (\S+)")? *$'
-HTTPD_PATTERN = r"\s*(\S+)\s*(\S*)\s*\-\s(\S+)\s\[(\S+\s*\S+)\]\s\"(\S+)\s+(\S+)\s(\S+)\"\s(\d+)\s(\S+)\s(\S+)\s\"(.*)\""
-SSHD_PATTERN = r"\[(\d.*)\]\s+(\S+)\s+(\S+)\s+(\S+)\s+((([A-Z]+)\sFROM\s+(.*))|(([A-Z]+))|((AUTH FAILURE)\sFROM\s(\S+)\s(.*))|((.+)\s+(\S+)\s+(\S+)\s+(\S+)))"
-HA_PROXY_PATTERN = r'^(\w+ \d+ \S+) (\S+) (\S+)\[(\d+)\]: (\S+):(\d+) \[(\S+)\] (\S+) (\S+) (\S+) (\S+) (\S+) *(\S+) (\S+) (\S+)(?: (\S+) (\S+) \{([^}]*)\} \{([^}]*)\} "(\S+) ([^"]+) (\S+)")? *$'
-
-FILES_TO_EXCLUDE = [".gz", ".md5", ".sha1", ".sha256", ".zip"]
 
 FILES_TO_EXCLUDE = [".gz", ".md5", ".sha1", ".sha256", ".zip"]
 
@@ -47,11 +37,10 @@ def anonymize_ip(matched_pattern) -> str:
     Returns an anonymized IP address.
 
     """
-    original_ip = matched_patterned_pattern.group(0)
+    original_ip = matched_pattern.group(0)
     if original_ip in lookup_table:
         anonymized_ip = lookup_table[original_ip]
     else:
-        ip_parts = original_ip.split(".")
         ip_parts = original_ip.split(".")
         anonymized_ip_parts = []
         for part in ip_parts:
@@ -69,13 +58,12 @@ def anonymize_ip_ha(matched_pattern) -> str:
     Returns a line with all of the IP addresses and their ports anonymized.
 
     """
-    original_ip = matched_patterned_pattern.group(5)
-    original_port = matched_patterned_pattern.group(6)
+    original_ip = matched_pattern.group(5)
+    original_port = matched_pattern.group(6)
 
     if original_ip in lookup_table:
         anonymized_ip = lookup_table[original_ip]
     else:
-        ip_parts = original_ip.split(".")
         ip_parts = original_ip.split(".")
         anonymized_ip_parts = [str(randomize_numbers(part, True)) for part in ip_parts]
         anonymized_ip = ".".join(anonymized_ip_parts)
@@ -97,36 +85,19 @@ def anonymize_ip_ha(matched_pattern) -> str:
         ]
     )
 
-    return "".join(
-        [
-            matched_pattern.string[: matched_pattern.start(5)],
-            anonymized_ip,
-            matched_pattern.string[matched_pattern.end(5) : matched_pattern.start(6)],
-            anonymized_port,
-            matched_pattern.string[matched_pattern.end(6) :],
-        ]
-    )
-
 
 def anonymize_ip_line(line: str, filename: str) -> str:
     """
     Takes a specific line from a file and its filename.
-    Takes a specific line from a file and its filename.
     Returns the line with all of the IP addresses anonymized.
 
-
     """
-    if "ha" in filename:
-        line = re.sub(HA_PROXY_PATTERN, anonymize_ip_ha, line)
     if "ha" in filename:
         line = re.sub(HA_PROXY_PATTERN, anonymize_ip_ha, line)
     else:
         line = re.sub(IP_PATTERN, anonymize_ip, line)
 
     return line.rstrip() + "\n"
-        line = re.sub(IP_PATTERN, anonymize_ip, line)
-
-    return line.rstrip() + "\n"
 
 
 def anonymize_timestamps(matched_pattern) -> str:
@@ -134,17 +105,12 @@ def anonymize_timestamps(matched_pattern) -> str:
     Takes a regex match, representing a line in any type of log file.
     Returns an anonymized timestamp that has the same length as the original one.
     This will break monotonicity.
-    This will break monotonicity.
 
     """
     original_timestamp = matched_pattern.group(0)
-    if original_timestamp in lookup_table:
-        anonymized_ip = lookup_table[original_timestamp]
-    original_timestamp = matched_pattern.group(0)
     if original_timestamp in lookup_table:
         anonymized_ip = lookup_table[original_timestamp]
     else:
-        ip_parts = original_timestamp.split(":")
         ip_parts = original_timestamp.split(":")
         anonymized_ip_parts = []
         for part in ip_parts:
@@ -152,7 +118,6 @@ def anonymize_timestamps(matched_pattern) -> str:
             anonymized_ip_parts.append(anonymized_part)
         anonymized_ip = ":".join(anonymized_ip_parts)
         lookup_table[original_timestamp] = anonymized_ip
-        lookup_table[original_timestamp] = anonymized_ip
 
     return anonymized_ip
 
@@ -168,7 +133,6 @@ def anonymize_user_id(user_id:str) ->str:
         anonymized_user_id = namesgenerator.get_random_name()
         lookup_table[user_id] = anonymized_user_id
 
-
     return anonymized_user_id
 
 
@@ -178,13 +142,11 @@ def anonymize_user_id_general(matched_pattern:str)->str:
     Returns a randomly generated user id.
 
     """
-    original_user_id = matched_patterned_pattern.group(0)
+    original_user_id = matched_pattern.group(0)
     anonymized_user_id = anonymize_user_id(original_user_id)
 
     return " " + anonymized_user_id + " "
 
-    return " " + anonymized_user_id + " "
-
 
 def anonymize_user_id_httpd_sshd(matched_pattern)->str:
     """
@@ -193,9 +155,6 @@ def anonymize_user_id_httpd_sshd(matched_pattern)->str:
 
     """
     original_user_id = matched_pattern.group(3)
-    if original_user_id == "-":
-        return matched_pattern.group(0)
-    original_user_id = matched_pattern.group(3)
     if original_user_id == "-":
         return matched_pattern.group(0)
     anonymized_user_id = anonymize_user_id(original_user_id)
@@ -208,14 +167,6 @@ def anonymize_user_id_httpd_sshd(matched_pattern)->str:
         ]
     )
 
-    return "".join(
-        [
-            matched_pattern.string[: matched_pattern.start(3)],
-            anonymized_user_id,
-            matched_pattern.string[matched_pattern.end(3) :],
-        ]
-    )
-
 
 def anonymize_sensitive_info_ha(matched_pattern)->str:
     """
@@ -226,11 +177,7 @@ def anonymize_sensitive_info_ha(matched_pattern)->str:
     # Validate that sensitive information is present
     if not (matched_pattern.group(18) and matched_pattern.group(19)):
         return matched_pattern.group(0)
-    if not (matched_pattern.group(18) and matched_pattern.group(19)):
-        return matched_pattern.group(0)
 
-    original_info1 = matched_pattern.group(18)
-    original_info2 = matched_pattern.group(19)
     original_info1 = matched_pattern.group(18)
     original_info2 = matched_pattern.group(19)
     anonymized_info1 = anonymize_user_id(original_info1)
@@ -246,31 +193,13 @@ def anonymize_sensitive_info_ha(matched_pattern)->str:
         ]
     )
 
-    return "".join(
-        [
-            matched_pattern.string[: matched_pattern.start(18)],
-            anonymized_info1,
-            matched_pattern.string[matched_pattern.end(18) : matched_pattern.start(19)],
-            anonymized_info2,
-            matched_pattern.string[matched_pattern.end(19) :],
-        ]
-    )
-
 
 def anonymize_user_line(line:str, filename:str)->str:
     """
     Takes a specific line from a file and its filename.
-    Takes a specific line from a file and its filename.
     Returns the line with all of the user information and sensitive information anonymized.
 
-
     """
-    if "httpd" in filename:
-        line = re.sub(HTTPD_PATTERN, anonymize_user_id_httpd_sshd, line)
-    elif "sshd" in filename:
-        line = re.sub(SSHD_PATTERN, anonymize_user_id_httpd_sshd, line)
-    elif "ha" in filename:
-        line = re.sub(HA_PROXY_PATTERN, anonymize_sensitive_info_ha, line)
     if "httpd" in filename:
         line = re.sub(HTTPD_PATTERN, anonymize_user_id_httpd_sshd, line)
     elif "sshd" in filename:
@@ -281,9 +210,6 @@ def anonymize_user_line(line:str, filename:str)->str:
         line = re.sub(USER_ID_PATTERN, anonymize_user_id_general, line)
 
     return line.rstrip() + "\n"
-        line = re.sub(USER_ID_PATTERN, anonymize_user_id_general, line)
-
-    return line.rstrip() + "\n"
 
 
 def anonymize_endpoint(original_endpoint)->str:
@@ -292,25 +218,19 @@ def anonymize_endpoint(original_endpoint)->str:
 
     """
     endpoint_parts = original_endpoint.strip("/").split("/")
-    endpoint_parts = original_endpoint.strip("/").split("/")
     anonymized_parts = []
     for part in endpoint_parts:
         if part in lookup_table:
             anonymized_part = lookup_table[part]
-        else:
         else:
             anonymized_part = namesgenerator.get_random_name()
             lookup_table[part] = anonymized_part
         anonymized_parts.append(anonymized_part)
 
-    if original_endpoint.startswith("/"):
-        anonymized_endpoint = "/" + "/".join(anonymized_parts)
-
     if original_endpoint.startswith("/"):
         anonymized_endpoint = "/" + "/".join(anonymized_parts)
     else:
         anonymized_endpoint = "/".join(anonymized_parts)
-        anonymized_endpoint = "/".join(anonymized_parts)
 
     return anonymized_endpoint
 
@@ -319,6 +239,7 @@ def anonymize_endpoint_general(match)->str:
     """
     Takes a regex match, representing a line in any type of log file.
     Returns the anonymized endpoint.
+
     """
     original_endpoint = match.group(0)
     return anonymize_endpoint(original_endpoint)
@@ -329,7 +250,6 @@ def anonymize_endpoint_httpd(match)->str:
     Takes a regex match, representing a line in an HTTP log file.
     Returns the anonymized endpoint.
 
-
     """
     original_endpoint = match.group(6)
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
@@ -342,42 +262,23 @@ def anonymize_endpoint_httpd(match)->str:
         ]
     )
 
-    return "".join(
-        [
-            match.string[: match.start(6)],
-            anonymized_endpoint,
-            match.string[match.end(6) :],
-        ]
-    )
-
 
 def anonymize_endpoint_sshd(match)->str:
     """
     Takes a regex match, representing a line in an SSH log file.
     Returns the anonymized endpoint.
+
     """
     original_endpoint = match.group(5)
     parts = original_endpoint.split(" ", 1)
     endpoint_part = parts[0].strip("/").split("/")
     remaining_string = parts[1] if len(parts) > 1 else ""
-    parts = original_endpoint.split(" ", 1)
-    endpoint_part = parts[0].strip("/").split("/")
-    remaining_string = parts[1] if len(parts) > 1 else ""
 
     if len(endpoint_part) < 2:
         return match.group(0)
 
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
     anonymized_endpoint += " " + remaining_string if remaining_string else ""
-    anonymized_endpoint += " " + remaining_string if remaining_string else ""
-
-    return "".join(
-        [
-            match.string[: match.start(5)],
-            anonymized_endpoint,
-            match.string[match.end(5) :],
-        ]
-    )
 
     return "".join(
         [
@@ -392,6 +293,7 @@ def anonymize_endpoint_ha(match)->str:
     """
     Takes a regex match, representing a line in an HA Proxy log file.
     Returns a line with all of the endpoints anonymized.
+
     """
     original_endpoint = match.group(9)
     anonymized_endpoint = anonymize_endpoint(original_endpoint)
@@ -402,15 +304,7 @@ def anonymize_endpoint_ha(match)->str:
             match.string[match.end(9) :],
         ]
     )
-    anonymized_line = "".join(
-        [
-            match.string[: match.start(9)],
-            anonymized_endpoint,
-            match.string[match.end(9) :],
-        ]
-    )
 
-    if match.group(21):
     if match.group(21):
         original_endpoint2 = match.group(21)
         anonymized_endpoint2 = anonymize_endpoint(original_endpoint2)
@@ -423,15 +317,6 @@ def anonymize_endpoint_ha(match)->str:
                 match.string[match.end(21) :],
             ]
         )
-        anonymized_line = "".join(
-            [
-                match.string[: match.start(9)],
-                anonymized_endpoint,
-                match.string[match.end(9) : match.start(21)],
-                anonymized_endpoint2,
-                match.string[match.end(21) :],
-            ]
-        )
 
     return anonymized_line
 
@@ -441,14 +326,7 @@ def anonymize_endpoint_line(line, filename)->str:
     Takes a specific line from a file and its filename.
     Returns the line with all of the endpoints anonymized.
 
-
     """
-    if "httpd" in filename:
-        line = re.sub(HTTPD_PATTERN, anonymize_endpoint_httpd, line)
-    elif "sshd" in filename:
-        line = re.sub(SSHD_PATTERN, anonymize_endpoint_sshd, line)
-    elif "ha" in filename:
-        line = re.sub(HA_PROXY_PATTERN, anonymize_endpoint_ha, line)
     if "httpd" in filename:
         line = re.sub(HTTPD_PATTERN, anonymize_endpoint_httpd, line)
     elif "sshd" in filename:
@@ -459,11 +337,7 @@ def anonymize_endpoint_line(line, filename)->str:
         line = re.sub(ENDPOINT_PATTERN, anonymize_endpoint_general, line)
     return line.rstrip() + "\n"
 
-        line = re.sub(ENDPOINT_PATTERN, anonymize_endpoint_general, line)
-    return line.rstrip() + "\n"
-
 
-def randomize_numbers(number, is_ip_address:bool=False)->str:
 def randomize_numbers(number, is_ip_address:bool=False)->str:
     """
     Takes a number and a flag to indicate if the given number is part of an IP address.
@@ -473,8 +347,6 @@ def randomize_numbers(number, is_ip_address:bool=False)->str:
     """
     num_len = len(str(number))
     lower_bound = 10 ** (num_len - 1)
-    if is_ip_address:
-    lower_bound = 10 ** (num_len - 1)
     if is_ip_address:
         upper_bound = min(255, (10**num_len - 1))
     else:
@@ -516,44 +388,22 @@ def count_lines(file_path:str)->int:
     parser.add_argument(
         "--timestamps", action="store_true", help="Only Anonymize Timestamps"
     )
-    parser = argparse.ArgumentParser(
-        description="Anonymize log files. Anonymized files and their lookup tables are storedd in the `anonymized-logs` folder"
-    )
-    parser.add_argument(
-        "input_directory",
-        type=str,
-        help="Directory containing the log files to be anonymized",
-    )
-    parser.add_argument(
-        "output_directory",
-        type=str,
-        help="Directory where the anonymized files will be stored",
-    )
-    parser.add_argument("--ip", action="store_true", help="Only Anonymize IP Addresses")
-    parser.add_argument(
-        "--endpoint", action="store_true", help="Only Anonymize Endpoints"
-    )
-    parser.add_argument("--user", action="store_true", help="Only Anonymize User IDs")
-    parser.add_argument(
-        "--timestamps", action="store_true", help="Only Anonymize Timestamps"
-    )
 
     args = parser.parse_args()
 
-    base_directory = args.input_directory
     base_directory = args.input_directory
     output_directory = args.output_directory
     os.makedirs(output_directory, exist_ok=True)
 
-    for file_name in tqdm.tqdm(os.listdir(base_directory), unit=' Files'):
-        if not file_name.lower().endswith(tuple(FILES_TO_EXCLUDE)):
-            input_file_path = os.path.join(base_directory, file_name)
-            output_file_path = os.path.join(output_directory, f"anonymized_{file_name}")
+    for input_file_path in tqdm.tqdm(os.listdir(base_directory), unit=' Files'):
+        if not input_file_path.lower().endswith(tuple(FILES_TO_EXCLUDE)):
+            input_file_path = os.path.join(base_directory, input_file_path)
+            output_file_path = os.path.join(output_directory, f"anonymized_{input_file_path}")
             lookup_file_path = os.path.join(
-                output_directory, f"lookup_table_{file_name}.txt"
+                output_directory, f"lookup_table_{input_file_path}.txt"
             )
             lookup_table = {}
-            line_count = count_lines(file_path=file_name)
+            line_count = count_lines(file_path=input_file_path)
             with open(
                 file=input_file_path, mode="r", encoding="utf-8"
             ) as input_file, open(
@@ -561,36 +411,21 @@ def count_lines(file_path:str)->int:
             ) as output_file:
                 for current_line in tqdm.tqdm(iterable=input_file,unit=' Lines', total=line_count):
                     if args.ip:
-                        current_line = anonymize_ip_line(current_line, file_name)
-                        current_line = anonymize_ip_line(current_line, file_name)
+                        current_line = anonymize_ip_line(current_line, input_file_path)
                     if args.endpoint:
-                        current_line = anonymize_endpoint_line(current_line, file_name)
-                        current_line = anonymize_endpoint_line(current_line, file_name)
+                        current_line = anonymize_endpoint_line(current_line, input_file_path)
                     if args.user:
-                        current_line = anonymize_user_line(current_line, file_name)
-                        current_line = anonymize_user_line(current_line, file_name)
+                        current_line = anonymize_user_line(current_line, input_file_path)
                     if args.timestamps:
                         re_match = re.match(TIMESTAMP_PATTERN, current_line)
                         if re_match:
                             new_ts = anonymize_timestamps(re_match)
                             if new_ts:
                                 current_line = re.sub(TIMESTAMP_PATTERN, new_ts, current_line)
-                        re_match = re.match(TIMESTAMP_PATTERN, current_line)
-                        if re_match:
-                            new_ts = anonymize_timestamps(re_match)
-                            if new_ts:
-                                current_line = re.sub(TIMESTAMP_PATTERN, new_ts, current_line)
                     if not (args.ip or args.timestamps or args.endpoint or args.user):
-                        current_line = anonymize_ip_line(current_line, file_name)
-                        current_line = anonymize_endpoint_line(current_line, file_name)
-                        current_line = anonymize_user_line(current_line, file_name)
-
-                    output_file.write(current_line)
-
-            with open(file=lookup_file_path, mode="w", encoding="utf-8") as lookup_file:
-                        current_line = anonymize_ip_line(current_line, file_name)
-                        current_line = anonymize_endpoint_line(current_line, file_name)
-                        current_line = anonymize_user_line(current_line, file_name)
+                        current_line = anonymize_ip_line(current_line, input_file_path)
+                        current_line = anonymize_endpoint_line(current_line, input_file_path)
+                        current_line = anonymize_user_line(current_line, input_file_path)
 
                     output_file.write(current_line)
 
@@ -598,9 +433,5 @@ def count_lines(file_path:str)->int:
                 for original_data, anonymized_data in lookup_table.items():
                     lookup_file.write(f"{anonymized_data} -> {original_data}\n")
 
-            print(f"Logs in {file_name} anonymized and saved to {output_file_path}")
-            print(f"Lookup table for {file_name} saved to {lookup_file_path} \n")
-                    lookup_file.write(f"{anonymized_data} -> {original_data}\n")
-
-            print(f"Logs in {file_name} anonymized and saved to {output_file_path}")
-            print(f"Lookup table for {file_name} saved to {lookup_file_path} \n")
+            print(f"Logs in {input_file_path} anonymized and saved to {output_file_path}")
+            print(f"Lookup table for {input_file_path} saved to {lookup_file_path} \n")