From 5875884b65a647e1c12c52cf1fe6f3468944ed51 Mon Sep 17 00:00:00 2001
From: AngelSPT <angelept180504@gmail.com>
Date: Sun, 1 Jun 2025 23:00:03 -0400
Subject: [PATCH] feat(data_structures): Add Suffix Array algorithm

---
 data_structures/suffix_array/__init__.py     |   0
 data_structures/suffix_array/suffix_array.py | 171 +++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 data_structures/suffix_array/__init__.py
 create mode 100644 data_structures/suffix_array/suffix_array.py

diff --git a/data_structures/suffix_array/__init__.py b/data_structures/suffix_array/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/data_structures/suffix_array/suffix_array.py b/data_structures/suffix_array/suffix_array.py
new file mode 100644
index 000000000000..139b6ae2f0c3
--- /dev/null
+++ b/data_structures/suffix_array/suffix_array.py
@@ -0,0 +1,171 @@
+"""
+Implementation of the Suffix Array construction algorithm in Python.
+
+This algorithm takes a text string as input and produces its Suffix Array.
+A Suffix Array lists the starting indices of all suffixes of a string, ordered
+lexicographically by suffix. It is used in bioinformatics, data compression, etc.
+"""
+
+
+def build_suffix_array(text: str) -> list[int]:
+    """
+    Builds the Suffix Array for a given text string.
+
+    The Suffix Array holds the starting index of every suffix of ``text``,
+    ordered by the lexicographic order of the suffixes themselves. Suffixes
+    are compared with Python's built-in string ordering (by Unicode code
+    point), so a suffix that is a proper prefix of another sorts before it
+    and uppercase ASCII letters sort before lowercase ones.
+
+    This is the naive construction: each suffix is materialised as a sort
+    key, which needs O(n^2) memory and up to O(n^2 log n) character
+    comparisons. It favours clarity over scalability.
+
+    Args:
+        text: The input text string. Appending a sentinel that is smaller
+              than every other character (commonly '$') is a widespread
+              convention that simplifies some suffix array algorithms, but
+              it is not required here: the suffixes of a string all have
+              different lengths, so they are always distinct and their
+              sorted order is unambiguous.
+
+    Returns:
+        list[int]: The Suffix Array, which is a list of starting
+                   indices of sorted suffixes. It is empty for an
+                   empty input string.
+
+    Raises:
+        TypeError: If the input is not a string.
+
+    Examples:
+        "banana" without a special end character. Sorted suffixes:
+            "a" (5)
+            "ana" (3)
+            "anana" (1)
+            "banana" (0)
+            "na" (4)
+            "nana" (2)
+
+        >>> build_suffix_array("banana")
+        [5, 3, 1, 0, 4, 2]
+
+        "banana$" with the '$' terminator, which sorts before every letter.
+        Sorted suffixes:
+            "$" (6)
+            "a$" (5)
+            "ana$" (3)
+            "anana$" (1)
+            "banana$" (0)
+            "na$" (4)
+            "nana$" (2)
+
+        >>> build_suffix_array("banana$")
+        [6, 5, 3, 1, 0, 4, 2]
+
+        >>> build_suffix_array("abracadabra")
+        [10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]
+
+        >>> build_suffix_array("mississippi")
+        [10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2]
+
+        The empty string has no suffixes:
+
+        >>> build_suffix_array("")
+        []
+
+        A run of equal characters: the suffixes "a" (2), "aa" (1) and
+        "aaa" (0) are still distinct, and the shorter ones sort first.
+
+        >>> build_suffix_array("aaa")
+        [2, 1, 0]
+
+        Comparison is case-sensitive ("B" sorts before "a"):
+
+        >>> build_suffix_array("aB")
+        [1, 0]
+
+        Non-string input is rejected:
+
+        >>> build_suffix_array(123)
+        Traceback (most recent call last):
+            ...
+        TypeError: Input must be a string.
+    """
+    if not isinstance(text, str):
+        raise TypeError("Input must be a string.")
+
+    # Every suffix is identified by its starting position, so the candidate
+    # entries of the Suffix Array are simply 0 .. len(text) - 1. For an empty
+    # string this range is empty and the result is [] without a special case.
+    start_positions = range(len(text))
+
+    # Order the starting positions by the suffix that begins there. Two
+    # different positions never yield equal suffixes (their lengths differ),
+    # so no tie-breaking rule is needed.
+    suffix_array = sorted(start_positions, key=lambda start: text[start:])
+    return suffix_array
+
+
+def print_suffixes_and_array(text: str, sa: list[int]):
+    """Print every suffix of text in suffix-array order, then the array itself."""
+    if sa:
+        print("  Sorted Suffixes (index: suffix):")
+        for start in sa:
+            print(f"    {start}: {text[start:]}")
+        print(f"  Suffix Array: {sa}")
+    else:
+        print("  (Empty string has no suffixes)")
+
+
+def main() -> None:
+    """
+    Main function to demonstrate Suffix Array construction.
+
+    Prints the sorted suffixes and the Suffix Array for a fixed list of
+    sample strings, then prompts once on standard input and does the same
+    for the string that was entered.
+    """
+    print("### Suffix Array Construction Demonstration ###\n")
+
+    # Fixed sample inputs for the demonstration.
+    test_cases = [
+        "banana",
+        "banana$",  # With a unique terminator
+        "abracadabra",
+        "mississippi",
+        "GATTACA",
+        "aaaaa",
+        "abcde",
+        "",  # Empty string
+    ]
+
+    for text_to_process in test_cases:
+        print(f'Original string: "{text_to_process}"')
+        try:
+            suffix_arr = build_suffix_array(text_to_process)
+            print_suffixes_and_array(text_to_process, suffix_arr)
+            print("")  # Newline for better readability
+        except TypeError as e:
+            print(f"  Error: {e}\n")
+
+    # Example with user input
+    print("--- Test with user input ---")
+    try:
+        # input() always returns a str. End of input (Ctrl+D) is signalled by
+        # EOFError, never by a None return value, so no None check is needed.
+        user_input = input(
+            "Enter a string to build its Suffix Array (e.g., 'banana'): "
+        )
+        sa_output = build_suffix_array(user_input)
+        print_suffixes_and_array(user_input, sa_output)
+    except TypeError as e:
+        print(f"  Error: {e}")
+    except EOFError:  # Handles Ctrl+D
+        print("\n  Input cancelled.")
+    except KeyboardInterrupt:
+        print("\n  Process interrupted by user.")
+
+
+# Run the demonstration only when the file is executed as a script.
+if __name__ == "__main__":
+    main()