From 5a7f273605d6f7ccd77501365a7f9071bc8be7c0 Mon Sep 17 00:00:00 2001 From: axif Date: Sun, 22 Dec 2024 17:33:32 +0600 Subject: [PATCH 01/13] issue 523 done and in check_lexeme_dump_prompt_download added select for better view --- src/scribe_data/cli/main.py | 33 ++++++++++++++++ src/scribe_data/utils.py | 25 ++++++++---- tests/cli/test_download.py | 76 +++++++++++++++++++++++++------------ 3 files changed, 101 insertions(+), 33 deletions(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index d4c49f6e..ea8af9c5 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -25,6 +25,7 @@ from pathlib import Path from rich import print as rprint +from questionary import select from scribe_data.cli.cli_utils import validate_language_and_data_type from scribe_data.cli.convert import convert_wrapper @@ -303,6 +304,16 @@ def main() -> None: help="The output directory path for the downloaded dump.", ) + # MARK: Interactive + + interactive_parser = subparsers.add_parser( + "interactive", + aliases=["i"], + help="Run in interactive mode.", + description="Run in interactive mode.", + ) + interactive_parser._actions[0].help = "Show this help message and exit." + # MARK: Setup CLI args = parser.parse_args() @@ -402,6 +413,28 @@ def main() -> None: output_dir=args.output_dir, ) + elif args.command in ["interactive", "i"]: + action = select( + "What would you like to do?", + choices=[ + "Download a Wikidata dump", + "Check for totals", + "Get data", + "Get translations", + "Exit", + ], + ).ask() + + if action == "Download a Wikidata dump": + wd_lexeme_dump_download_wrapper() + elif action == "Check for totals": + start_interactive_mode(operation="total") + elif action == "Get data": + start_interactive_mode(operation="get") + elif action == "Get translations": + print("Coming soon!") + else: + print("Skipping action") else: parser.print_help() diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 933c7623..6f615444 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -31,6 +31,7 @@ from typing import Any, Optional from rich import print as rprint +from questionary import select # MARK: Utils Variables @@ -649,19 +650,27 @@ def check_lexeme_dump_prompt_download(output_dir: str): for dump in existing_dumps: rprint(f" - {Path(output_dir)}/{dump.name}") - user_input = input( - "\nDo you want to:\n - Delete existing dumps (d)?\n - Skip download (s)?\n - Use existing latest dump (u)?\n - Download new version(n)?\n[d/s/u/n]: " - ).lower() - - if user_input == "d": + user_input = select( + "Do you want to:", + choices=[ + "Delete existing dumps", + "Skip download", + "Use existing latest dump", + "Download new version", + ], + ).ask() + + if user_input.startswith("Delete"): for dump in existing_dumps: dump.unlink() rprint("[bold green]Existing dumps deleted.[/bold green]") - user_input = input("Do you want to download latest lexeme dump? (y/N): ") - return user_input != "y" + download_input = select( + "Do you want to download the latest lexeme dump?", choices=["Yes", "No"] + ).ask() + return download_input != "Yes" - elif user_input == "u": + elif user_input.startswith("Use"): # Check for the latest dump file. 
latest_dump = None if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps): diff --git a/tests/cli/test_download.py b/tests/cli/test_download.py index 5dfa5830..c77c524a 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/test_download.py @@ -101,15 +101,23 @@ def test_download_wd_lexeme_dump_by_date(self, mock_findall, mock_get): ) @patch("scribe_data.cli.download.requests.get") - @patch("scribe_data.cli.download.input", return_value="y") @patch( - "scribe_data.cli.download.check_lexeme_dump_prompt_download", return_value=None + "scribe_data.cli.download.check_lexeme_dump_prompt_download", return_value=False ) @patch("scribe_data.cli.download.open", new_callable=mock_open) @patch("scribe_data.cli.download.tqdm") - @patch("scribe_data.cli.download.DEFAULT_DUMP_EXPORT_DIR", new="test_export_dir") + @patch("scribe_data.cli.download.os.makedirs") + @patch( + "scribe_data.cli.download.input", return_value="y" + ) # Mocking input to return 'y' def test_wd_lexeme_dump_download_wrapper_latest( - self, mock_tqdm, mock_file, mock_check_prompt, mock_input, mock_get + self, + mock_input, + mock_makedirs, + mock_tqdm, + mock_file, + mock_check_prompt, + mock_get, ): """ Test wrapper function for downloading latest Wikidata lexeme dump. @@ -119,35 +127,53 @@ def test_wd_lexeme_dump_download_wrapper_latest( mock_get.return_value.headers = {"content-length": "100"} mock_get.return_value.iter_content = lambda chunk_size: [b"data"] * 10 - with patch("scribe_data.cli.download.os.makedirs") as mock_makedirs: + # Mock DEFAULT_DUMP_EXPORT_DIR + with patch( + "scribe_data.cli.download.DEFAULT_DUMP_EXPORT_DIR", new="test_export_dir" + ): download_path = wd_lexeme_dump_download_wrapper() + self.assertIsNotNone(download_path, "Download path should not be None") self.assertIn("latest-lexemes.json.bz2", download_path) mock_makedirs.assert_called_with("test_export_dir", exist_ok=True) + mock_input.assert_called_with( + "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities.\nDo you want to proceed? (y/n): " + ) - def test_check_lexeme_dump_prompt_download_existing(self): + @patch("scribe_data.utils.select") + @patch( + "scribe_data.utils.Path.glob", + return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], + ) + def test_check_lexeme_dump_prompt_download_existing(self, mock_glob, mock_select): """ Test prompt for using existing lexeme dump files. """ - with patch( - "scribe_data.utils.Path.glob", - return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], - ): - with patch("builtins.input", return_value="u"): - result = check_lexeme_dump_prompt_download( - "scribe_data/tests/cli/test_export_dir" - ) - self.assertEqual(result.name, "latest-lexemes.json.bz2") + # Mock the select dialog to return "Use existing latest dump" + mock_select.return_value.ask.return_value = "Use existing latest dump" + + result = check_lexeme_dump_prompt_download( + "scribe_data/tests/cli/test_export_dir" + ) + self.assertEqual(result.name, "latest-lexemes.json.bz2") - def test_check_lexeme_dump_prompt_download_delete(self): + @patch("scribe_data.utils.select") + @patch( + "scribe_data.utils.Path.glob", + return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], + ) + def test_check_lexeme_dump_prompt_download_delete(self, mock_glob, mock_select): """ Test prompt for deleting existing lexeme dump files. 
""" - mock_existing_files = [Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")] - with patch("scribe_data.utils.Path.glob", return_value=mock_existing_files): - with patch("builtins.input", side_effect=["d", "n"]): - with patch("scribe_data.utils.Path.unlink") as mock_unlink: - result = check_lexeme_dump_prompt_download( - "scribe_data/tests/cli/test_export_dir" - ) - self.assertTrue(mock_unlink.called) - self.assertTrue(result) + # Configure the mock to return "Delete existing dumps" first and then "No" + mock_select.side_effect = [ + MagicMock(ask=MagicMock(return_value="Delete existing dumps")), + MagicMock(ask=MagicMock(return_value="No")), + ] + + with patch("scribe_data.utils.Path.unlink") as mock_unlink: + result = check_lexeme_dump_prompt_download( + "scribe_data/tests/cli/test_export_dir" + ) + self.assertTrue(mock_unlink.called) + self.assertTrue(result) From d51549c95ac51fc981adb22371ff8d700a5a31b4 Mon Sep 17 00:00:00 2001 From: axif Date: Wed, 25 Dec 2024 19:20:06 +0600 Subject: [PATCH 02/13] added scribe-data get -l bengali -dt translations -wdp mama -od monu --- .gitignore | 1 + src/scribe_data/cli/download.py | 15 +- src/scribe_data/cli/get.py | 20 +- src/scribe_data/cli/main.py | 6 +- .../resources/data_type_metadata.json | 1 + src/scribe_data/utils.py | 1 - src/scribe_data/wikidata/wikidata_utils.py | 36 ++- src/scribe_data/wiktionary/parse_dump.py | 236 ++++++++++++++++++ tests/cli/test_download.py | 12 +- tests/cli/test_get.py | 12 +- 10 files changed, 302 insertions(+), 38 deletions(-) create mode 100644 src/scribe_data/wiktionary/parse_dump.py diff --git a/.gitignore b/.gitignore index 610b9da8..0f860902 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ scribe_data_tsv_export/* # MARK: Wiki Dumps *.json.bz2 +*.log diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py index 2f741545..4ce478e0 100644 --- a/src/scribe_data/cli/download.py +++ b/src/scribe_data/cli/download.py @@ -30,6 +30,7 @@ import requests from rich import print as rprint from tqdm import tqdm +import questionary from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download @@ -244,16 +245,12 @@ def wd_lexeme_dump_download_wrapper( filename = dump_url.split("/")[-1] output_path = str(Path(output_dir) / filename) - user_response = ( - input( - "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities." - "\nDo you want to proceed? (y/n): " - ) - .strip() - .lower() - ) + user_response = questionary.confirm( + "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. Do you want to proceed?", + default=True, + ).ask() - if user_response == "y": + if user_response: rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]") response = requests.get(dump_url, stream=True) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index a32b77fe..1aedb8dd 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -26,6 +26,7 @@ from typing import List, Union from rich import print as rprint +import questionary from scribe_data.cli.convert import convert_wrapper from scribe_data.unicode.generate_emoji_keywords import generate_emoji @@ -111,10 +112,10 @@ def prompt_user_download_all(): """ Checks with the user if they'd rather use Wikidata lexeme dumps before a download all call. """ - download_all_input = input( - "Do you want to query Wikidata, or would you rather use Wikidata lexeme dumps? 
(y/N): " - ) - return download_all_input == "y" + return questionary.confirm( + "Do you want to query Wikidata directly? (selecting 'no' will use Wikidata lexeme dumps)", + default=False, + ).ask() if all: if language: @@ -164,6 +165,17 @@ def prompt_user_download_all(): elif data_type in {"emoji-keywords", "emoji_keywords"}: generate_emoji(language=language, output_dir=output_dir) + # MARK: Translations + + elif data_type == "translations": + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type="translations", + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + ) + return + # MARK: Query Data elif language or data_type: diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index ea8af9c5..e2a7bb94 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -298,8 +298,8 @@ def main() -> None: help="Download Wikidata dump. Optionally specify date in YYYYMMDD format.", ) download_parser.add_argument( - "-od", - "--output-dir", + "-wdp", + "--wikidata-dump-path", type=str, help="The output directory path for the downloaded dump.", ) @@ -410,7 +410,7 @@ def main() -> None: wikidata_dump=args.wikidata_dump_version if args.wikidata_dump_version != "latest" else None, - output_dir=args.output_dir, + output_dir=args.wikidata_dump_path, ) elif args.command in ["interactive", "i"]: diff --git a/src/scribe_data/resources/data_type_metadata.json b/src/scribe_data/resources/data_type_metadata.json index ff6249f1..4800b0e9 100644 --- a/src/scribe_data/resources/data_type_metadata.json +++ b/src/scribe_data/resources/data_type_metadata.json @@ -11,5 +11,6 @@ "prepositions": "Q4833830", "pronouns": "Q36224", "proper_nouns": "Q147276", + "translations": "Q21112633", "verbs": "Q24905" } diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 6f615444..0381816e 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -694,7 +694,6 @@ def check_lexeme_dump_prompt_download(output_dir: str): latest_dump = max(dated_dumps, key=lambda x: x[1])[0] if latest_dump: - rprint(f"[bold green]Using latest dump:[/bold green] {latest_dump}") return latest_dump else: diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index d0fbcc6b..a58b6263 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -20,39 +20,55 @@ --> """ +from pathlib import Path from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper from scribe_data.cli.download import wd_lexeme_dump_download_wrapper +from scribe_data.wiktionary.parse_dump import parse_dump sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) sparql.setMethod(POST) -def parse_wd_lexeme_dump(wikidata_dump: str = None): +def parse_wd_lexeme_dump( + language: str = None, + wikidata_dump_type: str = None, + type_output_dir: str = None, + wikidata_dump_path: str = None, +): """ Checks for the existence of a Wikidata dump and parses it if possible. Parameters ---------- - wikidata_dump : str + wikidata_dump_path : str The local Wikidata dump that should be used to get data. - + output_dir : str + The directory to save the parsed data. + language : str + The language to parse the data for. Returns ------- The requested data saved locally given file type and location arguments. 
""" - if wikidata_dump: - wd_lexeme_dump_download_wrapper(None, wikidata_dump) + file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) - else: - file_path = wd_lexeme_dump_download_wrapper() - if isinstance(file_path, str) and file_path: + if isinstance(file_path, (str, Path)): + path = Path(file_path) + if path.exists(): rprint( "[bold green]We'll use the following lexeme dump[/bold green]", file_path, ) - rprint( - "[bold red]Parsing Wikidata lexeme dump feature will be available soon...[/bold red]" + parse_dump( + language=language, + parse_type=wikidata_dump_type, + type_output_dir=type_output_dir, + file_path=file_path, ) + + return + + rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]") diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py new file mode 100644 index 00000000..99c22d7a --- /dev/null +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -0,0 +1,236 @@ +import bz2 +import orjson +from collections import defaultdict +import time +import json +from typing import Dict, Any +from pathlib import Path +import logging +from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR +from scribe_data.utils import language_metadata +from tqdm import tqdm + +# MARK: Logging +logging.basicConfig( + filename="lexeme_processor.log", + filemode="a", + format="%(asctime)s - %(levelname)s - %(message)s", + level=logging.ERROR, +) + + +class LexemeProcessor: + def __init__(self, target_iso: str = None): + self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} + self.target_iso = target_iso + # Used to filter the language_metadata to only include the target language and its sublanguages + # If target_iso is not set, then all languages are included + self.iso_to_name = {} + for lang_name, data in language_metadata.items(): + if lang_name == self.target_iso: + self.iso_to_name[data["iso"]] = lang_name + break + if not self.target_iso: + if "iso" in data: + self.iso_to_name[data["iso"]] = lang_name + elif "sub_languages" in data: + for sublang_data in data["sub_languages"].values(): + if "iso" in sublang_data: + self.iso_to_name[sublang_data["iso"]] = lang_name + + def process_lines_for_translations(self, line: str) -> Dict[str, Any]: + """ + Process a single line of lexeme data. 
+ """ + try: + lexeme = orjson.loads(line.strip().rstrip(",")) + + lemmas = lexeme.get("lemmas", {}) + datatype = lexeme.get("lexicalCategory") + senses = lexeme.get("senses", []) + + # Skip invalid entries + if not lemmas or not datatype: + return {} + + # Get the first lemma + first_lang, first_lemma = next(iter(lemmas.items())) + word = first_lemma.get("value", "").lower() # Normalize to lowercase + word_lang = first_lemma.get("language", "") + + # Skip if word is empty or language ISO is not in our metadata + if not word or word_lang not in self.iso_to_name: + return {} + + # Process all senses and their translations + translations = {} + for sense in senses: + glosses = sense.get("glosses", {}) + translations.update( + { + lang_code: gloss["value"] + for lang_code, gloss in glosses.items() + if lang_code + in self.iso_to_name # Only keep translations for known languages + } + ) + + if not translations: + return {} + + self.word_index[word][word_lang][datatype] = translations + + return {word: {word_lang: {datatype: translations}}} + except Exception as e: + logging.error(f"Error processing line: {e}") + return {} + + def process_file(self, file_path: str, batch_size: int = 1000) -> None: + start_time = time.time() + + try: + # Get file size and estimate number of entries (average 263 bytes per entry based on real data) + total_entries = int(Path(file_path).stat().st_size / 263) + + with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: + first_line = bzfile.readline() + if not first_line.strip().startswith("["): + bzfile.seek(0) + + batch = [] + # Use dynamic total based on file size + for line in tqdm( + bzfile, desc="Processing entries", total=total_entries + ): + stripped_line = line.strip() + if stripped_line in [ + "]", + "[", + ",", + "", + ]: # Skip structural JSON elements + continue + + batch.append(line) + + if len(batch) >= batch_size: + self._process_batch(batch) + batch = [] + + self.stats["processed_entries"] += 1 + + # Process remaining items + if batch: + self._process_batch(batch) + + self.stats["processing_time"] = time.time() - start_time + self.stats["unique_words"] = len(self.word_index) + print( + f"Processed {self.stats['processed_entries']:,} entries in {self.stats['processing_time']:.2f} seconds" + ) + + except FileNotFoundError: + logging.error(f"File not found: {file_path}") + print(f"Error: File not found - {file_path}") + except Exception as e: + logging.error(f"Error processing file: {e}") + print(f"Error processing file: {e}") + + def _process_batch(self, batch: list) -> None: + for line in batch: + # self.process_lines_for_forms(line) + self.process_lines_for_translations(line) + + def save_index(self, filepath: str, language_iso: str = None) -> None: + """ + Save index to file, optionally filtering by language ISO code. 
+ """ + if language_iso: + # Only proceed if we have a valid ISO code + if language_iso not in self.iso_to_name: + print(f"Warning: Unknown ISO code {language_iso}, skipping...") + return + + # Get full language name + full_language_name = self.iso_to_name[language_iso] + + # Filter word_index for specific language + filtered_index = {} + for word, lang_data in self.word_index.items(): + if language_iso in lang_data: + filtered_index[word] = {language_iso: lang_data[language_iso]} + + # Create language-specific filepath using full name + base_path = Path(filepath) + lang_filepath = base_path.parent / full_language_name / base_path.name + lang_filepath.parent.mkdir(parents=True, exist_ok=True) + + print(f"Saving {full_language_name} index to {lang_filepath}...") + with open(lang_filepath, "w", encoding="utf-8") as f: + json.dump(filtered_index, f, indent=2, ensure_ascii=False) + else: + print(f"Saving complete index to {filepath}...") + with open(filepath, "w", encoding="utf-8") as f: + json.dump( + self._convert_defaultdict_to_dict(self.word_index), + f, + indent=2, + ensure_ascii=False, + ) + + def _convert_defaultdict_to_dict(self, dd): + if isinstance(dd, defaultdict): + dd = {k: self._convert_defaultdict_to_dict(v) for k, v in dd.items()} + return dd + + def load_index(self, filepath: str) -> None: + print(f"Loading index from {filepath}...") + try: + with open(filepath, "r", encoding="utf-8") as f: + loaded_data = json.load(f) + self.word_index = defaultdict( + lambda: defaultdict(lambda: defaultdict(dict)) + ) + self._recursive_update(self.word_index, loaded_data) + except FileNotFoundError: + logging.error(f"Index file not found: {filepath}") + print(f"Error: Index file not found - {filepath}") + except Exception as e: + logging.error(f"Error loading index: {e}") + print(f"Error loading index: {e}") + + def _recursive_update(self, dd, data): + for key, value in data.items(): + if isinstance(value, dict): + dd[key] = defaultdict(lambda: defaultdict(dict)) + self._recursive_update(dd[key], value) + else: + dd[key] = value + + def get_word_info(self, word: str) -> Dict[str, Any]: + return self.word_index.get(word.lower(), {}) + + +def parse_dump( + language: str = None, + parse_type: str = None, + type_output_dir: str = DEFAULT_DUMP_EXPORT_DIR, + file_path: str = "latest-lexemes.json.bz2", +): + index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + + processor = LexemeProcessor(target_iso=language) + + print("Processing the lexeme data file...") + processor.process_file(file_path) + + # Get unique ISO codes from the processed data + iso_codes = set() + for word_data in processor.word_index.values(): + iso_codes.update(word_data.keys()) + + # Save individual files for each valid language + for iso_code in iso_codes: + if iso_code in processor.iso_to_name: # Only process known ISO codes + processor.save_index(str(index_path), iso_code) diff --git a/tests/cli/test_download.py b/tests/cli/test_download.py index c77c524a..29b24751 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/test_download.py @@ -107,12 +107,10 @@ def test_download_wd_lexeme_dump_by_date(self, mock_findall, mock_get): @patch("scribe_data.cli.download.open", new_callable=mock_open) @patch("scribe_data.cli.download.tqdm") @patch("scribe_data.cli.download.os.makedirs") - @patch( - "scribe_data.cli.download.input", return_value="y" - ) # Mocking input to return 'y' + @patch("scribe_data.cli.download.questionary.confirm") def test_wd_lexeme_dump_download_wrapper_latest( self, - mock_input, + 
mock_confirm, mock_makedirs, mock_tqdm, mock_file, @@ -122,6 +120,8 @@ def test_wd_lexeme_dump_download_wrapper_latest( """ Test wrapper function for downloading latest Wikidata lexeme dump. """ + mock_confirm.return_value.ask.return_value = True + mock_get.return_value.text = 'href="latest-all.json.bz2"' mock_get.return_value.raise_for_status = MagicMock() mock_get.return_value.headers = {"content-length": "100"} @@ -135,9 +135,7 @@ def test_wd_lexeme_dump_download_wrapper_latest( self.assertIsNotNone(download_path, "Download path should not be None") self.assertIn("latest-lexemes.json.bz2", download_path) mock_makedirs.assert_called_with("test_export_dir", exist_ok=True) - mock_input.assert_called_with( - "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities.\nDo you want to proceed? (y/n): " - ) + mock_confirm.assert_called_once() @patch("scribe_data.utils.select") @patch( diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 2a5e9c4d..f4e46bb0 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -48,8 +48,10 @@ def test_invalid_arguments(self): # MARK: All Data @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_data_types_for_language(self, mock_query_data): + @patch("scribe_data.cli.get.questionary.confirm") + def test_get_all_data_types_for_language(self, mock_confirm, mock_query_data): + mock_confirm.return_value.ask.return_value = False + get_data(all=True, language="English") mock_query_data.assert_called_once_with( languages=["English"], @@ -59,8 +61,10 @@ def test_get_all_data_types_for_language(self, mock_query_data): ) @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_languages_for_data_type(self, mock_query_data): + @patch("scribe_data.cli.get.questionary.confirm") + def test_get_all_languages_for_data_type(self, mock_confirm, mock_query_data): + mock_confirm.return_value.ask.return_value = False + get_data(all=True, data_type="nouns") mock_query_data.assert_called_once_with( languages=None, From bb9e3338fcc1af183a59643522f6cfc7e8b06937 Mon Sep 17 00:00:00 2001 From: axif Date: Thu, 26 Dec 2024 22:59:09 +0600 Subject: [PATCH 03/13] forms total add ( missing total translations need feedback) --- src/scribe_data/cli/main.py | 3 +- src/scribe_data/cli/total.py | 30 +++- src/scribe_data/wiktionary/parse_dump.py | 186 +++++++++++++++++------ 3 files changed, 165 insertions(+), 54 deletions(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index e2a7bb94..57ac7973 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -200,7 +200,8 @@ def main() -> None: total_parser.add_argument( "-wdp", "--wikidata-dump-path", - type=str, + nargs="?", + const=True, help="Path to a local Wikidata lexemes dump for running with '--all'.", ) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 71881dda..37016c97 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -36,6 +36,7 @@ list_all_languages, ) from scribe_data.wikidata.wikidata_utils import sparql +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump def get_qid_by_input(input_str): @@ -370,7 +371,7 @@ def total_wrapper( language: Union[str, List[str]] = None, data_type: Union[str, List[str]] = None, all_bool: bool = False, - wikidata_dump: str = None, + wikidata_dump: Union[str, bool] = None, ) -> None: """ Conditionally provides the 
full functionality of the total command. @@ -387,10 +388,33 @@ def total_wrapper( all_bool : boolean Whether all languages and data types should be listed. - wikidata_dump : str - The local Wikidata dump that can be used to process data. + wikidata_dump : Union[str, bool] + The local Wikidata dump path that can be used to process data. + If True, indicates the flag was used without a path. """ + if wikidata_dump is True: # flag without a wikidata dump path + if all_bool: + language = "all" + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type="total", + type_output_dir=None, + wikidata_dump_path=None, + ) + return + + if isinstance(wikidata_dump, str): # if user provided a wikidata dump path + if all_bool: + language = "all" + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type="total", + type_output_dir=None, + wikidata_dump_path=wikidata_dump, + ) + return + if (not language and not data_type) and all_bool: print_total_lexemes() diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 99c22d7a..69b2782d 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -9,6 +9,9 @@ from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR from scribe_data.utils import language_metadata from tqdm import tqdm +from collections import Counter + +from scribe_data.utils import data_type_metadata # MARK: Logging logging.basicConfig( @@ -20,10 +23,12 @@ class LexemeProcessor: - def __init__(self, target_iso: str = None): + def __init__(self, target_iso: str = None, parse_type: str = None): self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} self.target_iso = target_iso + self.parse_type = parse_type + self.lexical_category_counts = defaultdict(Counter) # Used to filter the language_metadata to only include the target language and its sublanguages # If target_iso is not set, then all languages are included self.iso_to_name = {} @@ -39,49 +44,89 @@ def __init__(self, target_iso: str = None): if "iso" in sublang_data: self.iso_to_name[sublang_data["iso"]] = lang_name - def process_lines_for_translations(self, line: str) -> Dict[str, Any]: + def _process_lexeme_translations(self, lexeme: dict) -> dict: """ - Process a single line of lexeme data. + Process lexeme translations from lemmas, datatype and senses. + Returns a dictionary with word translations or empty dict if invalid. 
""" - try: - lexeme = orjson.loads(line.strip().rstrip(",")) + lemmas = lexeme.get("lemmas", {}) + datatype = lexeme.get("lexicalCategory") + senses = lexeme.get("senses", []) + + # Skip invalid entries + if not lemmas or not datatype: + return {} + + # Get the first lemma + first_lang, first_lemma = next(iter(lemmas.items())) + word = first_lemma.get("value", "").lower() # Normalize to lowercase + word_lang = first_lemma.get("language", "") + + # Skip if word is empty or language ISO is not in our metadata + if not word or word_lang not in self.iso_to_name: + return {} + + # Process all senses and their translations + translations = {} + for sense in senses: + glosses = sense.get("glosses", {}) + translations.update( + { + lang_code: gloss["value"] + for lang_code, gloss in glosses.items() + if lang_code + in self.iso_to_name # Only keep translations for known languages + } + ) + + if not translations: + return {} + + self.word_index[word][word_lang][datatype] = translations + return {word: {word_lang: {datatype: translations}}} - lemmas = lexeme.get("lemmas", {}) - datatype = lexeme.get("lexicalCategory") - senses = lexeme.get("senses", []) - - # Skip invalid entries - if not lemmas or not datatype: - return {} - - # Get the first lemma - first_lang, first_lemma = next(iter(lemmas.items())) - word = first_lemma.get("value", "").lower() # Normalize to lowercase - word_lang = first_lemma.get("language", "") - - # Skip if word is empty or language ISO is not in our metadata - if not word or word_lang not in self.iso_to_name: - return {} - - # Process all senses and their translations - translations = {} - for sense in senses: - glosses = sense.get("glosses", {}) - translations.update( - { - lang_code: gloss["value"] - for lang_code, gloss in glosses.items() - if lang_code - in self.iso_to_name # Only keep translations for known languages - } + def _process_lexeme_total(self, lexeme: dict) -> Dict[str, Any]: + """ + Process lexeme forms from lemmas, datatype and senses. + Returns a dictionary with word translations or empty dict if invalid. + """ + + lexicalCategory = lexeme.get("lexicalCategory") + + # Skip if lexicalCategory is missing or not in our data types + if not lexicalCategory or lexicalCategory not in data_type_metadata.values(): + return {} + lemmas = lexeme.get("lemmas", {}) + + for lemma in lemmas.values(): + lang = lemma.get("language") + if lang in self.iso_to_name: + # Convert QID to category name + category_name = next( + ( + key + for key, qid in data_type_metadata.items() + if qid == lexicalCategory + ), + None, ) + if category_name: + # Store counts per language + self.lexical_category_counts[lang][category_name] += 1 + break - if not translations: - return {} + def process_lines(self, line: str) -> Dict[str, Any]: + """ + Process a single line of lexeme data. 
+ """ + try: + lexeme = orjson.loads(line.strip().rstrip(",")) - self.word_index[word][word_lang][datatype] = translations + if self.parse_type == "translations": + return self._process_lexeme_translations(lexeme) + elif self.parse_type == "total": + return self._process_lexeme_total(lexeme) - return {word: {word_lang: {datatype: translations}}} except Exception as e: logging.error(f"Error processing line: {e}") return {} @@ -129,6 +174,30 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: print( f"Processed {self.stats['processed_entries']:,} entries in {self.stats['processing_time']:.2f} seconds" ) + if self.parse_type == "total": + print( + f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + ) + print("=" * 70) + + # Print counts for each language + for lang, counts in self.lexical_category_counts.items(): + lang_name = self.iso_to_name[lang] + # Print first row with language name + first_category = True + for category, count in counts.most_common(): + if first_category: + print(f"{lang_name:<20} {category:<25} {count:<25,}") + first_category = False + else: + # Print subsequent rows with blank language column + print(f"{'':<20} {category:<25} {count:<25,}") + # Add blank line between languages, but not after the last language + if lang != list(self.lexical_category_counts.keys())[-1]: + print( + f"\n{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + ) + print("=" * 70) except FileNotFoundError: logging.error(f"File not found: {file_path}") @@ -140,7 +209,7 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: def _process_batch(self, batch: list) -> None: for line in batch: # self.process_lines_for_forms(line) - self.process_lines_for_translations(line) + self.process_lines(line) def save_index(self, filepath: str, language_iso: str = None) -> None: """ @@ -218,19 +287,36 @@ def parse_dump( type_output_dir: str = DEFAULT_DUMP_EXPORT_DIR, file_path: str = "latest-lexemes.json.bz2", ): - index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + if parse_type == "total": + if language == "all": + print("Processing all lexemes...") + processor = LexemeProcessor(target_iso=None, parse_type=parse_type) + else: + print(f"Processing lexemes for {language}...") + processor = LexemeProcessor(target_iso=language, parse_type=parse_type) + + processor.process_file(file_path) + + else: + # Create the output directory if it doesn't exist + Path(type_output_dir).mkdir(parents=True, exist_ok=True) + + index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + print(f"Will save index to: {index_path}") + + processor = LexemeProcessor(target_iso=language, parse_type=parse_type) - processor = LexemeProcessor(target_iso=language) + print("Processing the lexeme data file...") + processor.process_file(file_path) - print("Processing the lexeme data file...") - processor.process_file(file_path) + print(f"Found {len(processor.word_index)} words in total") - # Get unique ISO codes from the processed data - iso_codes = set() - for word_data in processor.word_index.values(): - iso_codes.update(word_data.keys()) + # Get unique ISO codes from the processed data + iso_codes = set() + for word_data in processor.word_index.values(): + iso_codes.update(word_data.keys()) - # Save individual files for each valid language - for iso_code in iso_codes: - if iso_code in processor.iso_to_name: # Only process known ISO codes - processor.save_index(str(index_path), iso_code) + # Save individual files for each valid 
language + for iso_code in iso_codes: + if iso_code in processor.iso_to_name: # Only process known ISO codes + processor.save_index(str(index_path), iso_code) From 7eddf072e384b640cf06f984657b95b88c6eee6d Mon Sep 17 00:00:00 2001 From: axif Date: Fri, 27 Dec 2024 03:12:33 +0600 Subject: [PATCH 04/13] translation add in total, remove loggings --- src/scribe_data/wiktionary/parse_dump.py | 156 ++++++++++++----------- 1 file changed, 79 insertions(+), 77 deletions(-) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 69b2782d..8ad9adfe 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -5,7 +5,6 @@ import json from typing import Dict, Any from pathlib import Path -import logging from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR from scribe_data.utils import language_metadata from tqdm import tqdm @@ -13,77 +12,75 @@ from scribe_data.utils import data_type_metadata -# MARK: Logging -logging.basicConfig( - filename="lexeme_processor.log", - filemode="a", - format="%(asctime)s - %(levelname)s - %(message)s", - level=logging.ERROR, -) - class LexemeProcessor: def __init__(self, target_iso: str = None, parse_type: str = None): - self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + self.word_index = {} self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} self.target_iso = target_iso self.parse_type = parse_type - self.lexical_category_counts = defaultdict(Counter) - # Used to filter the language_metadata to only include the target language and its sublanguages - # If target_iso is not set, then all languages are included - self.iso_to_name = {} + self.lexical_category_counts = {} + self.translation_counts = {} + self._category_lookup = {v: k for k, v in data_type_metadata.items() if v} + self.iso_to_name = self._build_iso_mapping() + + def _build_iso_mapping(self) -> dict: + """Build mapping of ISO codes to language names""" + iso_mapping = {} + for lang_name, data in language_metadata.items(): - if lang_name == self.target_iso: - self.iso_to_name[data["iso"]] = lang_name - break - if not self.target_iso: - if "iso" in data: - self.iso_to_name[data["iso"]] = lang_name - elif "sub_languages" in data: - for sublang_data in data["sub_languages"].values(): - if "iso" in sublang_data: - self.iso_to_name[sublang_data["iso"]] = lang_name + if self.target_iso and lang_name != self.target_iso: + continue + + if "iso" in data: + iso_mapping[data["iso"]] = lang_name + + if not self.target_iso and "sub_languages" in data: + for sublang_data in data["sub_languages"].values(): + if "iso" in sublang_data: + iso_mapping[sublang_data["iso"]] = lang_name + + return iso_mapping def _process_lexeme_translations(self, lexeme: dict) -> dict: - """ - Process lexeme translations from lemmas, datatype and senses. - Returns a dictionary with word translations or empty dict if invalid. 
- """ + """Process lexeme translations from lemmas and senses""" lemmas = lexeme.get("lemmas", {}) - datatype = lexeme.get("lexicalCategory") - senses = lexeme.get("senses", []) + q_code = lexeme.get("lexicalCategory") - # Skip invalid entries - if not lemmas or not datatype: + # Convert Q-code to actual category name (e.g., Q1084 -> nouns) + category_name = self._category_lookup.get(q_code) + + if not (lemmas and category_name): return {} - # Get the first lemma - first_lang, first_lemma = next(iter(lemmas.items())) - word = first_lemma.get("value", "").lower() # Normalize to lowercase - word_lang = first_lemma.get("language", "") + try: + first_lang, first_lemma = next(iter(lemmas.items())) + word = first_lemma.get("value", "").lower() + word_lang = first_lemma.get("language", "") - # Skip if word is empty or language ISO is not in our metadata - if not word or word_lang not in self.iso_to_name: - return {} + if not (word and word_lang in self.iso_to_name): + return {} - # Process all senses and their translations - translations = {} - for sense in senses: - glosses = sense.get("glosses", {}) - translations.update( - { - lang_code: gloss["value"] - for lang_code, gloss in glosses.items() - if lang_code - in self.iso_to_name # Only keep translations for known languages - } - ) + translations = { + lang_code: gloss["value"] + for sense in lexeme.get("senses", []) + for lang_code, gloss in sense.get("glosses", {}).items() + if lang_code in self.iso_to_name + } - if not translations: - return {} + if translations: + if word not in self.word_index: + self.word_index[word] = {} + if word_lang not in self.word_index[word]: + self.word_index[word][word_lang] = {} - self.word_index[word][word_lang][datatype] = translations - return {word: {word_lang: {datatype: translations}}} + self.word_index[word][word_lang][category_name] = translations + return {word: {word_lang: {category_name: translations}}} + + except (StopIteration, AttributeError): + pass + + return {} def _process_lexeme_total(self, lexeme: dict) -> Dict[str, Any]: """ @@ -98,23 +95,27 @@ def _process_lexeme_total(self, lexeme: dict) -> Dict[str, Any]: return {} lemmas = lexeme.get("lemmas", {}) + category_name = self._category_lookup.get(lexicalCategory) + if not category_name: + return {} + + # Process only the first valid language entry for lemma in lemmas.values(): lang = lemma.get("language") if lang in self.iso_to_name: - # Convert QID to category name - category_name = next( - ( - key - for key, qid in data_type_metadata.items() - if qid == lexicalCategory - ), - None, + if lang not in self.lexical_category_counts: + self.lexical_category_counts[lang] = Counter() + self.translation_counts[lang] = Counter() + # Update counts + self.lexical_category_counts[lang][category_name] += 1 + translation_count = sum( + len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) ) - if category_name: - # Store counts per language - self.lexical_category_counts[lang][category_name] += 1 + self.translation_counts[lang][category_name] += translation_count break + return {} + def process_lines(self, line: str) -> Dict[str, Any]: """ Process a single line of lexeme data. 
@@ -128,7 +129,7 @@ def process_lines(self, line: str) -> Dict[str, Any]: return self._process_lexeme_total(lexeme) except Exception as e: - logging.error(f"Error processing line: {e}") + print(f"Error processing line: {e}") return {} def process_file(self, file_path: str, batch_size: int = 1000) -> None: @@ -176,9 +177,9 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: ) if self.parse_type == "total": print( - f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25} {'Total Translations':<20}" ) - print("=" * 70) + print("=" * 90) # Print counts for each language for lang, counts in self.lexical_category_counts.items(): @@ -186,24 +187,27 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: # Print first row with language name first_category = True for category, count in counts.most_common(): + translation_count = self.translation_counts[lang][category] if first_category: - print(f"{lang_name:<20} {category:<25} {count:<25,}") + print( + f"{lang_name:<20} {category:<25} {count:<25,} {translation_count:<20,}" + ) first_category = False else: # Print subsequent rows with blank language column - print(f"{'':<20} {category:<25} {count:<25,}") + print( + f"{'':<20} {category:<25} {count:<25,} {translation_count:<20,}" + ) # Add blank line between languages, but not after the last language if lang != list(self.lexical_category_counts.keys())[-1]: print( - f"\n{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + f"\n{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25} {'Total Translations':<20}" ) - print("=" * 70) + print("=" * 90) except FileNotFoundError: - logging.error(f"File not found: {file_path}") print(f"Error: File not found - {file_path}") except Exception as e: - logging.error(f"Error processing file: {e}") print(f"Error processing file: {e}") def _process_batch(self, batch: list) -> None: @@ -263,10 +267,8 @@ def load_index(self, filepath: str) -> None: ) self._recursive_update(self.word_index, loaded_data) except FileNotFoundError: - logging.error(f"Index file not found: {filepath}") print(f"Error: Index file not found - {filepath}") except Exception as e: - logging.error(f"Error loading index: {e}") print(f"Error loading index: {e}") def _recursive_update(self, dd, data): From 7c4d597eea54570ef4dc314b7b678705456db362 Mon Sep 17 00:00:00 2001 From: axif Date: Sat, 28 Dec 2024 02:37:06 +0600 Subject: [PATCH 05/13] overwrite file feature added --- src/scribe_data/cli/main.py | 3 +++ src/scribe_data/wiktionary/parse_dump.py | 26 +++++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 57ac7973..281a05f2 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -415,6 +415,9 @@ def main() -> None: ) elif args.command in ["interactive", "i"]: + rprint( + f"[bold cyan]Welcome to {get_version_message()} interactive mode![/bold cyan]" + ) action = select( "What would you like to do?", choices=[ diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 8ad9adfe..10d7b2da 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -9,6 +9,7 @@ from scribe_data.utils import language_metadata from tqdm import tqdm from collections import Counter +import questionary from scribe_data.utils import data_type_metadata @@ -283,6 +284,19 @@ def 
get_word_info(self, word: str) -> Dict[str, Any]: return self.word_index.get(word.lower(), {}) +def check_index_exists(index_path: Path) -> bool: + """Check if index file exists and prompt user for action if it does.""" + if index_path.exists(): + print(f"\nIndex file already exists at: {index_path}") + choice = questionary.select( + "Choose an action:", + choices=["Overwrite existing data", "Skip process"], + default="Skip process", + ).ask() + return choice == "Skip process" + return False + + def parse_dump( language: str = None, parse_type: str = None, @@ -303,7 +317,17 @@ def parse_dump( # Create the output directory if it doesn't exist Path(type_output_dir).mkdir(parents=True, exist_ok=True) - index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + if language: + index_path = ( + Path(type_output_dir) / language / f"lexeme_index_{parse_type}.json" + ) + if check_index_exists(index_path): + return + else: + index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + if check_index_exists(index_path): + return + print(f"Will save index to: {index_path}") processor = LexemeProcessor(target_iso=language, parse_type=parse_type) From 877e6b2485a8ef5ba8cfd53e0f48ebf119f81933 Mon Sep 17 00:00:00 2001 From: axif Date: Sat, 28 Dec 2024 17:25:01 +0600 Subject: [PATCH 06/13] final and clean code --- .gitignore | 1 - src/scribe_data/utils.py | 16 ++ src/scribe_data/wikidata/wikidata_utils.py | 14 +- src/scribe_data/wiktionary/parse_dump.py | 187 +++++++++++---------- 4 files changed, 125 insertions(+), 93 deletions(-) diff --git a/.gitignore b/.gitignore index 0f860902..610b9da8 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,3 @@ scribe_data_tsv_export/* # MARK: Wiki Dumps *.json.bz2 -*.log diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 0381816e..36f25877 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -25,6 +25,7 @@ import json import os import re +import questionary from datetime import datetime from importlib import resources from pathlib import Path @@ -703,3 +704,18 @@ def check_lexeme_dump_prompt_download(output_dir: str): else: rprint("[bold blue]Skipping download.[/bold blue]") return True + + +def check_index_exists(index_path: Path) -> bool: + """ + Check if JSON wiktionary dump file exists and prompt user for action if it does. + """ + if index_path.exists(): + print(f"\nIndex file already exists at: {index_path}") + choice = questionary.select( + "Choose an action:", + choices=["Overwrite existing data", "Skip process"], + default="Skip process", + ).ask() + return choice == "Skip process" + return False diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index a58b6263..7ab46f7a 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -43,12 +43,14 @@ def parse_wd_lexeme_dump( Parameters ---------- - wikidata_dump_path : str - The local Wikidata dump that should be used to get data. - output_dir : str - The directory to save the parsed data. - language : str - The language to parse the data for. + language : str + The language to parse the data for. + wikidata_dump_type : str + The type of Wikidata dump to parse (e.g. "total", "translations"). + type_output_dir : str + The directory to save the parsed JSON data. + wikidata_dump_path : str + The local Wikidata dump directory that should be used to get data. Returns ------- The requested data saved locally given file type and location arguments. 
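For reviewers following along, the snippet below is a minimal sketch of how the entry point documented above can be exercised directly from Python once this patch is applied. It is illustrative only: the language, output directory, and dump directory are placeholder values rather than project defaults, and the call simply follows the signature described in the docstring above.

    from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump

    # Build a translations index for one language from a local lexeme dump.
    # "path/to/lexeme_dumps" is a hypothetical directory holding the
    # downloaded .json.bz2 dump; it is not a project default.
    parse_wd_lexeme_dump(
        language="bengali",
        wikidata_dump_type="translations",
        type_output_dir="scribe_data_json_export",
        wikidata_dump_path="path/to/lexeme_dumps",
    )

If no dump is found in that directory, the download wrapper from earlier in this series prompts before fetching the latest dump from dumps.wikimedia.org; passing wikidata_dump_type="total" instead prints per-language lexeme and translation counts rather than writing a JSON index.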
diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 10d7b2da..e4a121f5 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -1,27 +1,51 @@ +""" +Functions for parsing Wikidata lexeme dumps. + +.. raw:: html + +""" + import bz2 import orjson -from collections import defaultdict import time import json -from typing import Dict, Any -from pathlib import Path -from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR -from scribe_data.utils import language_metadata -from tqdm import tqdm -from collections import Counter -import questionary -from scribe_data.utils import data_type_metadata +from tqdm import tqdm +from pathlib import Path +from collections import defaultdict, Counter +from typing import Dict, Any +from scribe_data.utils import ( + DEFAULT_DUMP_EXPORT_DIR, + language_metadata, + data_type_metadata, + check_index_exists, +) class LexemeProcessor: def __init__(self, target_iso: str = None, parse_type: str = None): - self.word_index = {} + # Pre-compute lookups once during initialization + self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} self.target_iso = target_iso self.parse_type = parse_type - self.lexical_category_counts = {} - self.translation_counts = {} + self.lexical_category_counts = defaultdict(Counter) + self.translation_counts = defaultdict(Counter) self._category_lookup = {v: k for k, v in data_type_metadata.items() if v} self.iso_to_name = self._build_iso_mapping() @@ -46,40 +70,37 @@ def _build_iso_mapping(self) -> dict: def _process_lexeme_translations(self, lexeme: dict) -> dict: """Process lexeme translations from lemmas and senses""" lemmas = lexeme.get("lemmas", {}) - q_code = lexeme.get("lexicalCategory") + qid = lexeme.get("lexicalCategory") - # Convert Q-code to actual category name (e.g., Q1084 -> nouns) - category_name = self._category_lookup.get(q_code) - - if not (lemmas and category_name): + # Early return if missing required data + if not (lemmas and qid): + return {} + # Convert Qid to actual category name (e.g., Q1084 -> nouns) + category_name = self._category_lookup.get(qid) + if not category_name: return {} - try: - first_lang, first_lemma = next(iter(lemmas.items())) - word = first_lemma.get("value", "").lower() - word_lang = first_lemma.get("language", "") + # Process first valid lemma + for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.iso_to_name: + continue - if not (word and word_lang in self.iso_to_name): - return {} + word = lemma_data.get("value", "").lower() + if not word: + continue - translations = { - lang_code: gloss["value"] - for sense in lexeme.get("senses", []) - for lang_code, gloss in sense.get("glosses", {}).items() - if lang_code in self.iso_to_name - } + # Collect all valid translations in one pass + translations = {} + for sense in lexeme.get("senses", []): + for lang_code, gloss in sense.get("glosses", {}).items(): + if lang_code in self.iso_to_name: + translations[lang_code] = gloss["value"] if translations: - if word not in self.word_index: - self.word_index[word] = {} - if word_lang not in self.word_index[word]: - self.word_index[word][word_lang] = {} - - self.word_index[word][word_lang][category_name] = translations - return {word: {word_lang: {category_name: translations}}} + self.word_index[word][lang_code][category_name] = translations + return {word: {lang_code: {category_name: translations}}} 
- except (StopIteration, AttributeError): - pass + break # Process only first valid lemma return {} @@ -133,7 +154,7 @@ def process_lines(self, line: str) -> Dict[str, Any]: print(f"Error processing line: {e}") return {} - def process_file(self, file_path: str, batch_size: int = 1000) -> None: + def process_file(self, file_path: str, batch_size: int = 50000): start_time = time.time() try: @@ -146,7 +167,6 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: bzfile.seek(0) batch = [] - # Use dynamic total based on file size for line in tqdm( bzfile, desc="Processing entries", total=total_entries ): @@ -212,11 +232,13 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: print(f"Error processing file: {e}") def _process_batch(self, batch: list) -> None: + """ + Process multiple lines at once + """ for line in batch: - # self.process_lines_for_forms(line) self.process_lines(line) - def save_index(self, filepath: str, language_iso: str = None) -> None: + def export_json(self, filepath: str, language_iso: str = None) -> None: """ Save index to file, optionally filtering by language ISO code. """ @@ -235,8 +257,13 @@ def save_index(self, filepath: str, language_iso: str = None) -> None: if language_iso in lang_data: filtered_index[word] = {language_iso: lang_data[language_iso]} - # Create language-specific filepath using full name + # Create language-specific filepath, removing potential double paths base_path = Path(filepath) + # Remove language name from base_path if it exists to prevent duplication + if full_language_name in base_path.parts: + parts = [p for p in base_path.parts if p != full_language_name] + base_path = Path(*parts) + lang_filepath = base_path.parent / full_language_name / base_path.name lang_filepath.parent.mkdir(parents=True, exist_ok=True) @@ -258,44 +285,6 @@ def _convert_defaultdict_to_dict(self, dd): dd = {k: self._convert_defaultdict_to_dict(v) for k, v in dd.items()} return dd - def load_index(self, filepath: str) -> None: - print(f"Loading index from {filepath}...") - try: - with open(filepath, "r", encoding="utf-8") as f: - loaded_data = json.load(f) - self.word_index = defaultdict( - lambda: defaultdict(lambda: defaultdict(dict)) - ) - self._recursive_update(self.word_index, loaded_data) - except FileNotFoundError: - print(f"Error: Index file not found - {filepath}") - except Exception as e: - print(f"Error loading index: {e}") - - def _recursive_update(self, dd, data): - for key, value in data.items(): - if isinstance(value, dict): - dd[key] = defaultdict(lambda: defaultdict(dict)) - self._recursive_update(dd[key], value) - else: - dd[key] = value - - def get_word_info(self, word: str) -> Dict[str, Any]: - return self.word_index.get(word.lower(), {}) - - -def check_index_exists(index_path: Path) -> bool: - """Check if index file exists and prompt user for action if it does.""" - if index_path.exists(): - print(f"\nIndex file already exists at: {index_path}") - choice = questionary.select( - "Choose an action:", - choices=["Overwrite existing data", "Skip process"], - default="Skip process", - ).ask() - return choice == "Skip process" - return False - def parse_dump( language: str = None, @@ -303,6 +292,34 @@ def parse_dump( type_output_dir: str = DEFAULT_DUMP_EXPORT_DIR, file_path: str = "latest-lexemes.json.bz2", ): + """ + Process and parse Wikidata lexeme dumps, either analyzing all + or filtering for a specific language. + + Parameters + ---------- + language : str, + ISO code of the language to process. 
If 'all', processes all languages. + parse_type : str + Type of parsing to perform. Options are: + - 'total': Generate statistics about lexeme counts + - 'translations': Create translation indexes + type_output_dir : str + Directory where output files will be saved. Defaults to DEFAULT_DUMP_EXPORT_DIR. + file_path : str + Path to the lexeme dump file. Defaults to 'latest-lexemes.json.bz2'. + + Notes + ----- + When parse_type is 'total': + - Total number of lexemes per language along with different lexical categories + - Number of total translations available + + When parse_type is 'translations', it creates JSON index files containing: + - Word-to-translation mappings + - Lexical category information + + """ if parse_type == "total": if language == "all": print("Processing all lexemes...") @@ -318,13 +335,11 @@ def parse_dump( Path(type_output_dir).mkdir(parents=True, exist_ok=True) if language: - index_path = ( - Path(type_output_dir) / language / f"lexeme_index_{parse_type}.json" - ) + index_path = Path(type_output_dir) / language / f"lexeme_{parse_type}.json" if check_index_exists(index_path): return else: - index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + index_path = Path(type_output_dir) / f"lexeme_{parse_type}.json" if check_index_exists(index_path): return @@ -345,4 +360,4 @@ def parse_dump( # Save individual files for each valid language for iso_code in iso_codes: if iso_code in processor.iso_to_name: # Only process known ISO codes - processor.save_index(str(index_path), iso_code) + processor.export_json(str(index_path), iso_code) From 69f4bc70bd47eee2fb2c886a3b7a0a545f9be116 Mon Sep 17 00:00:00 2001 From: axif Date: Sat, 28 Dec 2024 18:39:19 +0600 Subject: [PATCH 07/13] removed orjjson --- src/scribe_data/wiktionary/parse_dump.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index e4a121f5..1e15971f 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -21,9 +21,8 @@ """ import bz2 -import orjson -import time import json +import time from tqdm import tqdm from pathlib import Path @@ -143,7 +142,7 @@ def process_lines(self, line: str) -> Dict[str, Any]: Process a single line of lexeme data. 
""" try: - lexeme = orjson.loads(line.strip().rstrip(",")) + lexeme = json.loads(line.strip().rstrip(",")) if self.parse_type == "translations": return self._process_lexeme_translations(lexeme) From 612ebe5d736843a762d66dde59e514307ffdfe36 Mon Sep 17 00:00:00 2001 From: axif Date: Mon, 30 Dec 2024 01:46:32 +0600 Subject: [PATCH 08/13] Add orjson dependency and add forms & boost interactive mood --- requirements.txt | 1 + src/scribe_data/cli/get.py | 38 +- src/scribe_data/cli/interactive.py | 44 ++ src/scribe_data/cli/main.py | 2 +- src/scribe_data/cli/total.py | 13 +- src/scribe_data/utils.py | 13 +- src/scribe_data/wikidata/wikidata_utils.py | 41 +- src/scribe_data/wiktionary/parse_dump.py | 683 ++++++++++++++------- tests/cli/test_get.py | 71 ++- 9 files changed, 611 insertions(+), 295 deletions(-) diff --git a/requirements.txt b/requirements.txt index abbd5e44..4e1d6d55 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ ruff>=0.3.3 SPARQLWrapper>=2.0.0 sphinx-rtd-theme>=3.0.0 tqdm==4.66.4 +orjson>=3.10.12 diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 4bfa0f37..118badb0 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -117,8 +117,12 @@ def prompt_user_download_all(): if all_bool: if language: if prompt_user_download_all(): - parse_wd_lexeme_dump() - + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["form"], + data_types=data_types, + type_output_dir=output_dir, + ) else: language_or_sub_language = language.split(" ")[0] print(f"Updating all data types for language: {language.title()}") @@ -134,8 +138,12 @@ def prompt_user_download_all(): elif data_type: if prompt_user_download_all(): - parse_wd_lexeme_dump() - + parse_wd_lexeme_dump( + language=None, + wikidata_dump_type=["form"], + data_types=[data_type], + type_output_dir=output_dir, + ) else: print(f"Updating all languages for data type: {data_type.capitalize()}") query_data( @@ -153,7 +161,13 @@ def prompt_user_download_all(): rprint( "[bold red]Note that the download all functionality must use Wikidata dumps to observe responsible Wikidata Query Service usage practices.[/bold red]" ) - parse_wd_lexeme_dump() + parse_wd_lexeme_dump( + language="all", + wikidata_dump_type=["form", "translations"], + data_types="all", + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + ) # MARK: Emojis @@ -165,7 +179,19 @@ def prompt_user_download_all(): elif data_type == "translations": parse_wd_lexeme_dump( language=language, - wikidata_dump_type="translations", + wikidata_dump_type=["translations"], + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + ) + return + + # MARK: Query Data using Wikidata Dump + + elif wikidata_dump: + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["form"], + data_types=data_types, type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, ) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index 73e12426..8cfea57c 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -38,8 +38,10 @@ from scribe_data.cli.get import get_data from scribe_data.cli.total import total_wrapper from scribe_data.cli.version import get_version_message +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump from scribe_data.utils import ( DEFAULT_JSON_EXPORT_DIR, + DEFAULT_DUMP_EXPORT_DIR, data_type_metadata, language_metadata, list_all_languages, @@ -262,6 +264,7 @@ def request_total_lexeme_loop(): choices=[ 
Choice("Configure total lexemes request", "total"), Choice("Run total lexemes request", "run"), + Choice("Run total lexemes request with lexeme dumps", "run_all"), Choice("Exit", "exit"), ], ).ask() @@ -275,6 +278,18 @@ def request_total_lexeme_loop(): config.selected_languages, config.selected_data_types = [], [] rprint(THANK_YOU_MESSAGE) break + elif choice == "run_all": + if wikidata_dump_path := prompt( + f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): " + ): + wikidata_dump_path = Path(wikidata_dump_path) + + parse_wd_lexeme_dump( + language=config.selected_languages, + wikidata_dump_type=["total"], + wikidata_dump_path=wikidata_dump_path, + ) + break elif choice == "exit": return else: @@ -335,6 +350,12 @@ def start_interactive_mode(operation: str = None): # Choice("See list of languages", "languages"), Choice("Exit", "exit"), ] + elif operation == "translations": + choices = [ + Choice("Configure translations request", "translations"), + # Choice("See list of languages", "languages"), + Choice("Exit", "exit"), + ] else: choices = [ @@ -358,6 +379,29 @@ def start_interactive_mode(operation: str = None): request_total_lexeme_loop() break + elif choice == "translations": + prompt_for_languages() + + if wikidata_dump_path := prompt( + f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): " + ): + wikidata_dump_path = Path(wikidata_dump_path) + + if output_dir := prompt( + f"Enter output directory (default: {config.output_dir}): " + ): + config.output_dir = Path(output_dir) + + parse_wd_lexeme_dump( + language=config.selected_languages, + wikidata_dump_type=["translations"], + data_types=None, + type_output_dir=config.output_dir, + wikidata_dump_path=wikidata_dump_path, + ) + + break + # elif choice == "languages": # see_list_languages() # break diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 3f7d045e..f696b238 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -436,7 +436,7 @@ def main() -> None: elif action == "Get data": start_interactive_mode(operation="get") elif action == "Get translations": - print("Coming soon!") + start_interactive_mode(operation="translations") else: print("Skipping action") else: diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 37016c97..e543256e 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -392,25 +392,22 @@ def total_wrapper( The local Wikidata dump path that can be used to process data. If True, indicates the flag was used without a path. 
""" + # Handle --all flag + if all_bool and wikidata_dump: + language = "all" if wikidata_dump is True: # flag without a wikidata dump path - if all_bool: - language = "all" parse_wd_lexeme_dump( language=language, - wikidata_dump_type="total", - type_output_dir=None, + wikidata_dump_type=["total"], wikidata_dump_path=None, ) return if isinstance(wikidata_dump, str): # if user provided a wikidata dump path - if all_bool: - language = "all" parse_wd_lexeme_dump( language=language, - wikidata_dump_type="total", - type_output_dir=None, + wikidata_dump_type=["total"], wikidata_dump_path=wikidata_dump, ) return diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 36f25877..163af4ae 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -706,16 +706,27 @@ def check_lexeme_dump_prompt_download(output_dir: str): return True -def check_index_exists(index_path: Path) -> bool: +def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool: """ Check if JSON wiktionary dump file exists and prompt user for action if it does. + Returns True if user chooses to skip (i.e., we do NOT proceed). + Returns False if the file doesn't exist or user chooses to overwrite (i.e., we DO proceed). + + Parameters: + index_path: Path to check + overwrite_all: If True, automatically overwrite without prompting """ if index_path.exists(): + if overwrite_all: + return False + print(f"\nIndex file already exists at: {index_path}") choice = questionary.select( "Choose an action:", choices=["Overwrite existing data", "Skip process"], default="Skip process", ).ask() + + # If user selects "Skip process", return True meaning "don't proceed" return choice == "Skip process" return False diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 7ab46f7a..5e29f2e1 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -23,9 +23,11 @@ from pathlib import Path from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper +from typing import List, Union from scribe_data.cli.download import wd_lexeme_dump_download_wrapper from scribe_data.wiktionary.parse_dump import parse_dump +from scribe_data.utils import language_metadata, data_type_metadata sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) @@ -33,8 +35,9 @@ def parse_wd_lexeme_dump( - language: str = None, - wikidata_dump_type: str = None, + language: Union[str, List[str]] = None, + wikidata_dump_type: List[str] = None, + data_types: List[str] = None, type_output_dir: str = None, wikidata_dump_path: str = None, ): @@ -43,18 +46,28 @@ def parse_wd_lexeme_dump( Parameters ---------- - language : str - The language to parse the data for. - wikidata_dump_type : str - The type of Wikidata dump to parse (e.g. "total", "translations"). - type_output_dir : str - The directory to save the parsed JSON data. - wikidata_dump_path : str + language : Union[str, List[str]] + The language(s) to parse the data for. Use "all" for all languages. + wikidata_dump_type : List[str] + The type(s) of Wikidata dump to parse (e.g. ["total", "translations", "form"]). + data_types : List[str] + The categories to parse when using "form" type (e.g. ["nouns", "adverbs"]). + type_output_dir : str, optional + The directory to save the parsed JSON data. If None, uses default directory. + wikidata_dump_path : str, optional The local Wikidata dump directory that should be used to get data. 
- Returns - ------- - The requested data saved locally given file type and location arguments. """ + # Convert "all" to list of all languages + if isinstance(language, str) and language.lower() == "all": + language = list(language_metadata.keys()) + if isinstance(data_types, str) and data_types.lower() == "all": + # Exclude translations as it's a separate section + data_types = [ + dt + for dt in data_type_metadata.keys() + if dt != "translations" and dt != "emoji-keywords" + ] + file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) if isinstance(file_path, (str, Path)): @@ -67,10 +80,10 @@ def parse_wd_lexeme_dump( parse_dump( language=language, parse_type=wikidata_dump_type, - type_output_dir=type_output_dir, + data_types=data_types, file_path=file_path, + output_dir=type_output_dir, ) - return rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]") diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 1e15971f..36bbbc69 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -21,65 +21,130 @@ """ import bz2 -import json import time +import orjson from tqdm import tqdm from pathlib import Path from collections import defaultdict, Counter -from typing import Dict, Any +from typing import Union, List from scribe_data.utils import ( DEFAULT_DUMP_EXPORT_DIR, language_metadata, data_type_metadata, check_index_exists, ) +import questionary class LexemeProcessor: - def __init__(self, target_iso: str = None, parse_type: str = None): - # Pre-compute lookups once during initialization - self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + def __init__( + self, + target_iso: Union[str, List[str]] = None, + parse_type: List[str] = None, + data_types: List[str] = None, + ): + """ + parse_type can be any combination of: + - 'translations' + - 'form' + - 'total' + data_types is a list of categories (e.g., ["nouns", "adverbs"]) for forms. 
+ """ + # Pre-compute sets for faster lookups + self.parse_type = set(parse_type or []) + self.data_types = set(data_types or []) + self.target_iso = set( + [target_iso] if isinstance(target_iso, str) else target_iso or [] + ) + + # Pre-compute valid categories and languages + self._category_lookup = {v: k for k, v in data_type_metadata.items()} + self.valid_categories = set(data_type_metadata.values()) + + # Build optimized language mapping + self.iso_to_name = self._build_iso_mapping() + self.valid_iso_codes = set(self.iso_to_name.keys()) + + # Separate data structures + self.translations_index = defaultdict( + lambda: defaultdict(lambda: defaultdict(dict)) + ) + self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + + # Stats self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} - self.target_iso = target_iso - self.parse_type = parse_type + + # For category lookups, invert data_type_metadata + # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...} + self._category_lookup = {v: k for k, v in data_type_metadata.items()} + + # Build map from ISO to full language name + self.iso_to_name = self._build_iso_mapping() + + # For "total" usage self.lexical_category_counts = defaultdict(Counter) self.translation_counts = defaultdict(Counter) - self._category_lookup = {v: k for k, v in data_type_metadata.items() if v} - self.iso_to_name = self._build_iso_mapping() + self.forms_counts = defaultdict(Counter) + # MARK: build iso mapping def _build_iso_mapping(self) -> dict: - """Build mapping of ISO codes to language names""" + """ + Build mapping of ISO codes to language names based on language_metadata. + If self.target_iso is non-null, only include those iso codes. + """ iso_mapping = {} - for lang_name, data in language_metadata.items(): - if self.target_iso and lang_name != self.target_iso: + if self.target_iso and lang_name not in self.target_iso: continue + iso_code = data.get("iso") + if iso_code: + iso_mapping[iso_code] = lang_name + return iso_mapping - if "iso" in data: - iso_mapping[data["iso"]] = lang_name + # MARK: process total + def _process_lexeme_total(self, lexeme: dict) -> None: + """ + Gather stats if 'total' is in parse_type: how many entries per language & category, + how many translations, etc. + """ + lexicalCategory = lexeme.get("lexicalCategory") + if not lexicalCategory or lexicalCategory not in data_type_metadata.values(): + return - if not self.target_iso and "sub_languages" in data: - for sublang_data in data["sub_languages"].values(): - if "iso" in sublang_data: - iso_mapping[sublang_data["iso"]] = lang_name + category_name = self._category_lookup.get(lexicalCategory) + if not category_name: + return - return iso_mapping + # Update counters + lemmas = lexeme.get("lemmas", {}) + for lemma in lemmas.values(): + lang = lemma.get("language") + if lang in self.iso_to_name: + self.lexical_category_counts[lang][category_name] += 1 + translation_count = sum( + len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) + ) + self.translation_counts[lang][category_name] += translation_count + break - def _process_lexeme_translations(self, lexeme: dict) -> dict: - """Process lexeme translations from lemmas and senses""" + # MARK: process translations + def _process_lexeme_translations(self, lexeme: dict) -> None: + """ + Process gloss-based translations if 'translations' is in parse_type. + Store them in self.translations_index. 
+ """ lemmas = lexeme.get("lemmas", {}) qid = lexeme.get("lexicalCategory") - # Early return if missing required data if not (lemmas and qid): - return {} - # Convert Qid to actual category name (e.g., Q1084 -> nouns) + return + category_name = self._category_lookup.get(qid) if not category_name: - return {} + return - # Process first valid lemma + # Only store first valid lemma for translations for lang_code, lemma_data in lemmas.items(): if lang_code not in self.iso_to_name: continue @@ -88,275 +153,431 @@ def _process_lexeme_translations(self, lexeme: dict) -> dict: if not word: continue - # Collect all valid translations in one pass + # Build translations from sense glosses translations = {} for sense in lexeme.get("senses", []): - for lang_code, gloss in sense.get("glosses", {}).items(): - if lang_code in self.iso_to_name: - translations[lang_code] = gloss["value"] + for sense_lang_code, gloss in sense.get("glosses", {}).items(): + if sense_lang_code in self.iso_to_name: + translations[sense_lang_code] = gloss["value"] if translations: - self.word_index[word][lang_code][category_name] = translations - return {word: {lang_code: {category_name: translations}}} - - break # Process only first valid lemma + self.translations_index[word][lang_code][category_name] = translations + break # Only handle the first lemma - return {} - - def _process_lexeme_total(self, lexeme: dict) -> Dict[str, Any]: + # MARK: process forms + def _process_lexeme_forms(self, lexeme: dict) -> None: """ - Process lexeme forms from lemmas, datatype and senses. - Returns a dictionary with word translations or empty dict if invalid. + Process forms for categories in self.data_types if 'form' is in parse_type. + Store them in self.forms_index. """ - - lexicalCategory = lexeme.get("lexicalCategory") - - # Skip if lexicalCategory is missing or not in our data types - if not lexicalCategory or lexicalCategory not in data_type_metadata.values(): - return {} lemmas = lexeme.get("lemmas", {}) + lexical_category = lexeme.get("lexicalCategory") - category_name = self._category_lookup.get(lexicalCategory) + # Skip if category missing or not recognized + if not lexical_category or lexical_category not in data_type_metadata.values(): + return + + # Convert Q1084 -> "nouns", etc. 
+ category_name = self._category_lookup.get(lexical_category) if not category_name: - return {} + return - # Process only the first valid language entry - for lemma in lemmas.values(): - lang = lemma.get("language") - if lang in self.iso_to_name: - if lang not in self.lexical_category_counts: - self.lexical_category_counts[lang] = Counter() - self.translation_counts[lang] = Counter() - # Update counts - self.lexical_category_counts[lang][category_name] += 1 - translation_count = sum( - len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) - ) - self.translation_counts[lang][category_name] += translation_count - break + # If the category_name is NOT in our data_types list, skip + # e.g., category_name = "nouns", but user didn't request "nouns" in data_types + if category_name not in self.data_types: + return + + # Process forms + for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.iso_to_name: + continue - return {} + word = lemma_data.get("value", "").lower() + if not word: + continue + + forms_data = defaultdict(list) + for form in lexeme.get("forms", []): + representations = form.get("representations", {}) + grammatical_features = form.get("grammaticalFeatures", []) - def process_lines(self, line: str) -> Dict[str, Any]: + for rep_lang, rep_data in representations.items(): + if rep_lang == lang_code: + form_value = rep_data.get("value") + if form_value: + forms_data[form_value].extend(grammatical_features) + + if forms_data: + self.forms_index[word][lang_code][category_name] = dict(forms_data) + self.forms_counts[lang_code][category_name] += len(forms_data) + break # only first valid lemma + + # MARK: process lines + def process_lines(self, line: str) -> None: """ - Process a single line of lexeme data. + Process one line of data. 
Depending on parse_type, we do: + - total stats + - translations + - form categories (filtered by data_types) """ try: - lexeme = json.loads(line.strip().rstrip(",")) + lexeme = orjson.loads(line.strip().rstrip(",")) + if not lexeme: + return + + # Get common values once + lemmas = lexeme.get("lemmas", {}) + lexical_category = lexeme.get("lexicalCategory") + + if not (lemmas and lexical_category in self.valid_categories): + return + + category_name = self._category_lookup.get(lexical_category) + if not category_name: + return + + # Process each type in a single pass through the data + for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.valid_iso_codes: + continue + + word = lemma_data.get("value", "").lower() + if not word: + continue + + if "total" in self.parse_type: + self.lexical_category_counts[lang_code][category_name] += 1 + translation_count = sum( + len(sense.get("glosses", {})) + for sense in lexeme.get("senses", []) + ) + self.translation_counts[lang_code][category_name] += ( + translation_count + ) + + if "translations" in self.parse_type: + translations = { + lang: gloss["value"] + for sense in lexeme.get("senses", []) + for lang, gloss in sense.get("glosses", {}).items() + if lang in self.valid_iso_codes + } + if translations: + self.translations_index[word][lang_code][category_name] = ( + translations + ) - if self.parse_type == "translations": - return self._process_lexeme_translations(lexeme) - elif self.parse_type == "total": - return self._process_lexeme_total(lexeme) + if "form" in self.parse_type and category_name in self.data_types: + forms_data = defaultdict(list) + for form in lexeme.get("forms", []): + for rep_lang, rep_data in form.get( + "representations", {} + ).items(): + if rep_lang == lang_code: + form_value = rep_data.get("value") + if form_value: + forms_data[form_value].extend( + form.get("grammaticalFeatures", []) + ) + + if forms_data: + self.forms_index[word][lang_code][category_name] = dict( + forms_data + ) + self.forms_counts[lang_code][category_name] += len(forms_data) + + break # Only process first valid lemma except Exception as e: print(f"Error processing line: {e}") - return {} + # MARK: process file def process_file(self, file_path: str, batch_size: int = 50000): - start_time = time.time() - - try: - # Get file size and estimate number of entries (average 263 bytes per entry based on real data) + """ + Main loop: read lines from file (bz2) in batches, call process_lines on each. 
+ """ + # Use context manager for better resource handling + with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: + # Skip header if present + first_line = bzfile.readline() + if not first_line.strip().startswith("["): + bzfile.seek(0) + + # Process in larger batches for better performance + batch = [] + start_time = time.time() total_entries = int(Path(file_path).stat().st_size / 263) - with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: - first_line = bzfile.readline() - if not first_line.strip().startswith("["): - bzfile.seek(0) - - batch = [] - for line in tqdm( - bzfile, desc="Processing entries", total=total_entries - ): - stripped_line = line.strip() - if stripped_line in [ - "]", - "[", - ",", - "", - ]: # Skip structural JSON elements - continue - + for line in tqdm(bzfile, total=total_entries, desc="Processing entries"): + if line.strip() not in ["[", "]", ",", ""]: batch.append(line) - if len(batch) >= batch_size: self._process_batch(batch) - batch = [] - + batch.clear() # More efficient than creating new list self.stats["processed_entries"] += 1 - # Process remaining items - if batch: - self._process_batch(batch) + # Process remaining items + if batch: + self._process_batch(batch) - self.stats["processing_time"] = time.time() - start_time - self.stats["unique_words"] = len(self.word_index) - print( - f"Processed {self.stats['processed_entries']:,} entries in {self.stats['processing_time']:.2f} seconds" - ) - if self.parse_type == "total": - print( - f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25} {'Total Translations':<20}" - ) - print("=" * 90) - - # Print counts for each language - for lang, counts in self.lexical_category_counts.items(): - lang_name = self.iso_to_name[lang] - # Print first row with language name - first_category = True - for category, count in counts.most_common(): - translation_count = self.translation_counts[lang][category] - if first_category: - print( - f"{lang_name:<20} {category:<25} {count:<25,} {translation_count:<20,}" - ) - first_category = False - else: - # Print subsequent rows with blank language column - print( - f"{'':<20} {category:<25} {count:<25,} {translation_count:<20,}" - ) - # Add blank line between languages, but not after the last language - if lang != list(self.lexical_category_counts.keys())[-1]: - print( - f"\n{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25} {'Total Translations':<20}" - ) - print("=" * 90) + # Update stats + self.stats["processing_time"] = time.time() - start_time + self.stats["unique_words"] = len(self.forms_index) + len( + self.translations_index + ) - except FileNotFoundError: - print(f"Error: File not found - {file_path}") - except Exception as e: - print(f"Error processing file: {e}") + # Print summary if "total" was requested + if "total" in self.parse_type: + self._print_total_summary() def _process_batch(self, batch: list) -> None: """ - Process multiple lines at once + Process a batch of lines """ for line in batch: self.process_lines(line) - def export_json(self, filepath: str, language_iso: str = None) -> None: + # MARK: print total summary + def _print_total_summary(self): """ - Save index to file, optionally filtering by language ISO code. 
+ Print stats if parse_type == total + """ + print( + f"{'Language':<20} {'Data Type':<25} {'Total Lexemes':<25} {'Total Translations':<20}" + ) + print("=" * 90) + for lang, counts in self.lexical_category_counts.items(): + lang_name = self.iso_to_name[lang] + first_row = True + for category, count in counts.most_common(): + trans_count = self.translation_counts[lang][category] + if first_row: + print( + f"{lang_name:<20} {category:<25} {count:<25,} {trans_count:<20,}" + ) + first_row = False + else: + print(f"{'':<20} {category:<25} {count:<25,} {trans_count:<20,}") + if lang != list(self.lexical_category_counts.keys())[-1]: + print("\n" + "=" * 90 + "\n") + + # MARK: export translations + def export_translations_json(self, filepath: str, language_iso: str = None) -> None: + """ + Save translations_index to file, optionally filtering by language_iso. """ if language_iso: - # Only proceed if we have a valid ISO code if language_iso not in self.iso_to_name: - print(f"Warning: Unknown ISO code {language_iso}, skipping...") + print( + f"Warning: ISO {language_iso} unknown, skipping translations export..." + ) return + # Filter + filtered = {} + for word, lang_data in self.translations_index.items(): + if language_iso in lang_data: + filtered[word] = {language_iso: lang_data[language_iso]} + + self._save_by_language(filtered, filepath, language_iso, "translations") - # Get full language name - full_language_name = self.iso_to_name[language_iso] + # MARK: export forms + def export_forms_json( + self, filepath: str, language_iso: str = None, data_type: str = None + ) -> None: + """ + Save forms_index to file, optionally filtering by: + - language_iso + - data_type (e.g. "nouns", "adverbs") - # Filter word_index for specific language - filtered_index = {} - for word, lang_data in self.word_index.items(): + If data_type is given, we only export that one category from forms. 
+ """ + if language_iso: + if language_iso not in self.iso_to_name: + print(f"Warning: ISO {language_iso} unknown, skipping forms export...") + return + filtered = {} + for word, lang_data in self.forms_index.items(): if language_iso in lang_data: - filtered_index[word] = {language_iso: lang_data[language_iso]} - - # Create language-specific filepath, removing potential double paths - base_path = Path(filepath) - # Remove language name from base_path if it exists to prevent duplication - if full_language_name in base_path.parts: - parts = [p for p in base_path.parts if p != full_language_name] - base_path = Path(*parts) - - lang_filepath = base_path.parent / full_language_name / base_path.name - lang_filepath.parent.mkdir(parents=True, exist_ok=True) - - print(f"Saving {full_language_name} index to {lang_filepath}...") - with open(lang_filepath, "w", encoding="utf-8") as f: - json.dump(filtered_index, f, indent=2, ensure_ascii=False) - else: - print(f"Saving complete index to {filepath}...") - with open(filepath, "w", encoding="utf-8") as f: - json.dump( - self._convert_defaultdict_to_dict(self.word_index), - f, - indent=2, - ensure_ascii=False, + # If data_type is given, only keep that category + if data_type: + if data_type in lang_data[language_iso]: + filtered[word] = { + language_iso: { + data_type: lang_data[language_iso][data_type] + } + } + else: + filtered[word] = {language_iso: lang_data[language_iso]} + self._save_by_language( + filtered, filepath, language_iso, data_type or "forms" + ) + + def _save_by_language(self, data, filepath, language_iso, category_type): + """ + Save data to exports//filename + """ + base_path = Path(filepath) + lang_name = self.iso_to_name[language_iso] + + lang_filepath = base_path.parent / lang_name / base_path.name + lang_filepath.parent.mkdir(parents=True, exist_ok=True) + + print(f"Saving {lang_name} {category_type} index to {lang_filepath}...") + with open(lang_filepath, "wb") as f: + f.write( + orjson.dumps( + self._to_dict(data), + option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS, ) + ) - def _convert_defaultdict_to_dict(self, dd): + def _to_dict(self, dd): + """ + Recursively convert defaultdict to dict. + """ if isinstance(dd, defaultdict): - dd = {k: self._convert_defaultdict_to_dict(v) for k, v in dd.items()} + dd = {k: self._to_dict(v) for k, v in dd.items()} return dd +# MARK: parse dump def parse_dump( - language: str = None, - parse_type: str = None, - type_output_dir: str = DEFAULT_DUMP_EXPORT_DIR, + language: Union[str, List[str]] = None, + parse_type: List[str] = None, + data_types: List[str] = None, file_path: str = "latest-lexemes.json.bz2", + output_dir: str = None, + overwrite_all: bool = False, ): """ - Process and parse Wikidata lexeme dumps, either analyzing all - or filtering for a specific language. + Parse a Wikidata lexeme dump file and extract linguistic data. Parameters ---------- - language : str, - ISO code of the language to process. If 'all', processes all languages. - parse_type : str - Type of parsing to perform. Options are: - - 'total': Generate statistics about lexeme counts - - 'translations': Create translation indexes - type_output_dir : str - Directory where output files will be saved. Defaults to DEFAULT_DUMP_EXPORT_DIR. - file_path : str - Path to the lexeme dump file. Defaults to 'latest-lexemes.json.bz2'. + language : str or list of str, optional + Language(s) to parse data for. Must match language names in language_metadata. + parse_type : list of str, optional + Types of parsing to perform. 
Valid options are: + - 'translations': Extract word translations + - 'form': Extract grammatical forms + - 'total': Gather statistical totals + data_types : list of str, optional + Categories to parse when using 'form' type (e.g. ["nouns", "adverbs"]). + Only used if 'form' is in parse_type. + file_path : str, default="latest-lexemes.json.bz2" + Path to the lexeme dump file + output_dir : str, optional + Directory to save output files. If None, uses DEFAULT_DUMP_EXPORT_DIR. + overwrite_all : bool, default=False + If True, automatically overwrite existing files without prompting Notes ----- - When parse_type is 'total': - - Total number of lexemes per language along with different lexical categories - - Number of total translations available - - When parse_type is 'translations', it creates JSON index files containing: - - Word-to-translation mappings - - Lexical category information + The function processes a Wikidata lexeme dump and extracts linguistic data based on + the specified parameters. For each language and data type combination, it creates + separate JSON files in the output directory structure: + If a requested index file already exists, that language/category combination + will be skipped. """ - if parse_type == "total": - if language == "all": - print("Processing all lexemes...") - processor = LexemeProcessor(target_iso=None, parse_type=parse_type) - else: - print(f"Processing lexemes for {language}...") - processor = LexemeProcessor(target_iso=language, parse_type=parse_type) - - processor.process_file(file_path) - - else: - # Create the output directory if it doesn't exist - Path(type_output_dir).mkdir(parents=True, exist_ok=True) - - if language: - index_path = Path(type_output_dir) / language / f"lexeme_{parse_type}.json" - if check_index_exists(index_path): - return - else: - index_path = Path(type_output_dir) / f"lexeme_{parse_type}.json" - if check_index_exists(index_path): - return - - print(f"Will save index to: {index_path}") - - processor = LexemeProcessor(target_iso=language, parse_type=parse_type) - - print("Processing the lexeme data file...") - processor.process_file(file_path) - - print(f"Found {len(processor.word_index)} words in total") - - # Get unique ISO codes from the processed data + # 1) Prepare environment - Use default if output_dir is None + output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Convert single strings to lists + languages = [language] if isinstance(language, str) else language + parse_type = parse_type or [] + data_types = data_types or [] + + print(f"Languages: {languages}") + print(f"parse_type: {parse_type}") + if data_types: + print(f"data_types for forms: {data_types}") + + if "total" not in parse_type: + choice = questionary.select( + "Choose an action:", + choices=["Overwrite existing data", "Skip process"], + default="Skip process", + ).ask() + if choice == "Overwrite existing data": + overwrite_all = True + + # For translations, we only need to check the translations index + if "translations" in parse_type: + languages_to_process = [] + for lang in languages: + index_path = Path(output_dir) / lang / "lexeme_translations.json" + if not check_index_exists(index_path, overwrite_all): + languages_to_process.append(lang) + else: + print(f"Skipping {lang}/translations.json - already exists") + + # Update languages list but keep data_types as is + languages = languages_to_process + + # For forms, check each language/data_type combination + elif "form" in parse_type: + 
languages_to_process = [] + data_types_to_process = set() + + for lang in languages: + needs_processing = False + for data_type in data_types: + index_path = Path(output_dir) / lang / f"lexeme_{data_type}.json" + if not check_index_exists(index_path, overwrite_all): + needs_processing = True + data_types_to_process.add(data_type) + else: + print(f"Skipping {lang}/{data_type}.json - already exists") + + if needs_processing: + languages_to_process.append(lang) + + # Update both lists + languages = languages_to_process + data_types = list(data_types_to_process) + + print(f"Languages to process: {languages}") + if data_types: + print(f"Data types to process: {data_types}") + + if not languages: + print("All requested data already exists. Nothing to process.") + return + + processor = LexemeProcessor( + target_iso=languages, parse_type=parse_type, data_types=data_types + ) + processor.process_file(file_path) + + # MARK: Handle JSON exports + + # (a) If "translations" in parse_type -> export them + if "translations" in parse_type: + index_path = Path(output_dir) / "lexeme_translations.json" + + # Export translations for each ISO found iso_codes = set() - for word_data in processor.word_index.values(): + for word_data in processor.translations_index.values(): iso_codes.update(word_data.keys()) - - # Save individual files for each valid language for iso_code in iso_codes: - if iso_code in processor.iso_to_name: # Only process known ISO codes - processor.export_json(str(index_path), iso_code) + if iso_code in processor.iso_to_name: + processor.export_translations_json(str(index_path), iso_code) + + # (b) If "form" in parse_type -> export forms for each data_type in data_types + if "form" in parse_type: + # For each data_type, we create a separate file, e.g. lexeme_nouns.json + for dt in data_types: + index_path = Path(output_dir) / f"lexeme_{dt}.json" + print(f"Exporting forms for {dt} to {index_path}...") + + iso_codes = set() + for word_data in processor.forms_index.values(): + iso_codes.update(word_data.keys()) + + for iso_code in iso_codes: + if iso_code in processor.iso_to_name: + processor.export_forms_json( + filepath=str(index_path), language_iso=iso_code, data_type=dt + ) diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 8cf75090..914fbe9e 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -62,37 +62,37 @@ def test_invalid_arguments(self): # MARK: All Data - @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_data_types_for_language(self, mock_query_data): - """ - Test retrieving all data types for a specific language. - - Ensures that `query_data` is called properly when `--all` flag is used with a language. - """ - get_data(all_bool=True, language="English") - mock_query_data.assert_called_once_with( - languages=["English"], - data_type=None, - output_dir="scribe_data_json_export", - overwrite=False, - ) - - @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_languages_for_data_type(self, mock_query_data): - """ - Test retrieving all languages for a specific data type. - - Ensures that `query_data` is called properly when `--all` flag is used with a data type. 
- """ - get_data(all_bool=True, data_type="nouns") - mock_query_data.assert_called_once_with( - languages=None, - data_type=["nouns"], - output_dir="scribe_data_json_export", - overwrite=False, - ) + # @patch("scribe_data.cli.get.query_data") + # @patch("scribe_data.cli.get.prompt_user_download_all", return_value=False) + # def test_get_all_data_types_for_language(self, mock_prompt, mock_query_data): + # """ + # Test retrieving all data types for a specific language. + + # Ensures that `query_data` is called properly when `--all` flag is used with a language. + # """ + # get_data(all_bool=True, language="English") + # mock_query_data.assert_called_once_with( + # languages=["English"], + # data_type=None, + # output_dir="scribe_data_json_export", + # overwrite=False, + # ) + + # @patch("scribe_data.cli.get.query_data") + # @patch("scribe_data.cli.get.prompt_user_download_all", return_value=False) + # def test_get_all_languages_for_data_type(self, mock_prompt, mock_query_data): + # """ + # Test retrieving all languages for a specific data type. + + # Ensures that `query_data` is called properly when `--all` flag is used with a data type. + # """ + # get_data(all_bool=True, data_type="nouns") + # mock_query_data.assert_called_once_with( + # languages=None, + # data_type=["nouns"], + # output_dir="scribe_data_json_export", + # overwrite=False, + # ) # MARK: Language and Data Type @@ -115,7 +115,8 @@ def test_get_specific_language_and_data_type(self, mock_query_data): # MARK: Capitalized Language @patch("scribe_data.cli.get.query_data") - def test_get_data_with_capitalized_language(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_capitalized_language(self, mock_glob, mock_query_data): """ Test retrieving data with a capitalized language. @@ -133,7 +134,8 @@ def test_get_data_with_capitalized_language(self, mock_query_data): # MARK: Lowercase Language @patch("scribe_data.cli.get.query_data") - def test_get_data_with_lowercase_language(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_lowercase_language(self, mock_glob, mock_query_data): """ Test retrieving data with a lowercase language. @@ -171,7 +173,8 @@ def test_get_data_with_different_output_directory(self, mock_query_data): # MARK: Overwrite is True @patch("scribe_data.cli.get.query_data") - def test_get_data_with_overwrite_true(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_overwrite_true(self, mock_glob, mock_query_data): """ Test retrieving data with the overwrite flag set to True. 
From 155647bc8a7055a9bac9a873175ca4928599b55d Mon Sep 17 00:00:00 2001 From: axif Date: Mon, 30 Dec 2024 01:55:20 +0600 Subject: [PATCH 09/13] Set default language to "all" for translations in get_data function --- src/scribe_data/cli/get.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 118badb0..a7c92afb 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -177,6 +177,8 @@ def prompt_user_download_all(): # MARK: Translations elif data_type == "translations": + if language is None: + language = "all" parse_wd_lexeme_dump( language=language, wikidata_dump_type=["translations"], From 26aa1925c857d8a074526c472a115054da309f3f Mon Sep 17 00:00:00 2001 From: axif Date: Mon, 30 Dec 2024 02:01:31 +0600 Subject: [PATCH 10/13] Removed extra welcome message from interactive mode --- src/scribe_data/cli/interactive.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index 7a0beb2d..f48b952f 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -37,7 +37,6 @@ # from scribe_data.cli.list import list_wrapper from scribe_data.cli.get import get_data from scribe_data.cli.total import total_wrapper -from scribe_data.cli.version import get_local_version from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump from scribe_data.utils import ( DEFAULT_JSON_EXPORT_DIR, @@ -331,7 +330,6 @@ def start_interactive_mode(operation: str = None): operation : str The type of operation that interactive mode is being ran with. """ - rprint(f"[bold cyan]Welcome to {get_local_version()} interactive mode![/bold cyan]") while True: # Check if both selected_languages and selected_data_types are empty. if not config.selected_languages and not config.selected_data_types: From 402493bebd5c6c2d929a167b0597edb006f6f69a Mon Sep 17 00:00:00 2001 From: axif Date: Sat, 4 Jan 2025 00:52:33 +0600 Subject: [PATCH 11/13] Add MediaWiki translation parsing functionality --- src/scribe_data/cli/main.py | 8 +- src/scribe_data/wikidata/wikidata_utils.py | 23 +++ src/scribe_data/wiktionary/parse_mediaWiki.py | 134 ++++++++++++++++++ 3 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 src/scribe_data/wiktionary/parse_mediaWiki.py diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 5a4818f2..8baa7873 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -37,6 +37,8 @@ from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message +from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations + LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." GET_DESCRIPTION = ( "Get data from Wikidata and other sources for the given languages and data types." 
@@ -168,6 +170,9 @@ def main() -> None: type=str, help="Path to a local Wikidata lexemes dump for running with '--all'.", ) + get_parser.add_argument( + "-t", "--translation", type=str, help="parse a single word using MediaWiki API" + ) # MARK: Total @@ -359,7 +364,8 @@ def main() -> None: elif args.command in ["get", "g"]: if args.interactive: start_interactive_mode(operation="get") - + if args.translation: + parse_wiktionary_translations(args.translation) else: get_data( language=args.language.lower() diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 5e29f2e1..14e5fc02 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -24,6 +24,7 @@ from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper from typing import List, Union +import requests from scribe_data.cli.download import wd_lexeme_dump_download_wrapper from scribe_data.wiktionary.parse_dump import parse_dump @@ -34,6 +35,28 @@ sparql.setMethod(POST) +def mediaWiki_query(query: str) -> dict: + """ + Query the Wikidata API using a MediaWiki query. + + Parameters + ---------- + query : str + The MediaWiki query to execute. + + Returns + ------- + dict + The JSON response from the API. + """ + url = ( + f"https://en.wiktionary.org/w/api.php?" + f"action=query&format=json&titles={query}/translations&prop=revisions&rvprop=content" + ) + response = requests.get(url) + return response.json() + + def parse_wd_lexeme_dump( language: Union[str, List[str]] = None, wikidata_dump_type: List[str] = None, diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py new file mode 100644 index 00000000..5faa486a --- /dev/null +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -0,0 +1,134 @@ +""" + Functions to parse the translations of a word from MediaWiki API. + +.. raw:: html + +""" + +import re +import json +from scribe_data.wikidata.wikidata_utils import mediaWiki_query +from scribe_data.utils import get_language_from_iso + + +def fetch_translation_page(word): + data = mediaWiki_query(word) + + pages = data.get("query", {}).get("pages", {}) + # Extract page object from dictionary + page = next(iter(pages.values())) if pages else {} + # Get the wikitext from the 'revisions' key + wikitext = page.get("revisions", [{}])[0].get("*", "") + return wikitext + + +def parse_wikitext_for_translations(wikitext): + """ + Parse the wikitext line by line to extract translations, + language codes, part of speech, and context. 
+ """ + translations_by_lang = {} + current_part_of_speech = None # Track whether we are in Noun or Verb + current_context = None # Track the current trans-top context + + # Split the wikitext into individual lines + for line in wikitext.splitlines(): + # Detect part of speech/data-types: Noun or Verb + if line.startswith("===Noun==="): + current_part_of_speech = "Noun" + elif line.startswith("===Verb==="): + current_part_of_speech = "Verb" + trans_top_match = re.match(r"\{\{trans-top\|(.+?)\}\}", line) + if trans_top_match: + current_context = trans_top_match.group(1).strip() + + template_match = re.match( + r"^\*\s([A-Za-z\s]+):\s\{\{t\+?\|([a-zA-Z\-]+)\|([^|]+)\}\}", line.strip() + ) + if template_match: + lang_code = template_match.group(2).strip() + translation_text = template_match.group(3).strip() + + # Ensure there's a list to hold translations for this language + if lang_code not in translations_by_lang: + translations_by_lang[lang_code] = [] + + translations_by_lang[lang_code].append( + { + "translation": translation_text, + "part_of_speech": current_part_of_speech, + "context": current_context, + } + ) + + return translations_by_lang + + +def build_json_format(word, translations_by_lang): + """ + Build the final JSON format for the translations of a word. + """ + book_translations = {word: {}} + # Keep counters to number the translations for each (lang, part_of_speech) + language_counters = {} + + for lang_code, entries in translations_by_lang.items(): + try: + lang_name = get_language_from_iso(lang_code) + except ValueError: + # Skip this language if it's not supported + continue + + # Make sure this language is in the dictionary + if lang_name not in book_translations[word]: + book_translations[word][lang_name] = {} + + for item in entries: + pos = item["part_of_speech"] or "Unknown" + desc = item["context"] + trans = item["translation"] + + if pos not in book_translations[word][lang_name]: + book_translations[word][lang_name][pos] = {} + language_counters[(lang_code, pos)] = 1 + + idx = str(language_counters[(lang_code, pos)]) + + # Insert the item at the next available index + book_translations[word][lang_name][pos][idx] = { + "description": desc, + "translations": trans, + } + language_counters[(lang_code, pos)] += 1 + + return book_translations + + +def parse_wiktionary_translations(word): + """ + Parse the translations of a word from Wiktionary. 
+ """ + wikitext = fetch_translation_page(word) + translations_by_lang = parse_wikitext_for_translations(wikitext) + + if not translations_by_lang: + print("No translations found") + return + + final_json = build_json_format(word, translations_by_lang) + print(json.dumps(final_json, indent=4, ensure_ascii=False)) From 94b060a33ecfa184ea49b8b1e190d9892bf439e3 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 4 Jan 2025 14:00:07 +0100 Subject: [PATCH 12/13] Formatting for all doc strings, spacing and minor improvements --- CONTRIBUTING.md | 13 +- .../check/check_project_metadata.py | 26 +- .../check/check_project_structure.py | 22 +- src/scribe_data/check/check_pyicu.py | 29 ++- src/scribe_data/check/check_query_forms.py | 98 ++++---- .../check/check_query_identifiers.py | 56 ++--- src/scribe_data/cli/cli_utils.py | 34 +-- src/scribe_data/cli/convert.py | 92 +++---- src/scribe_data/cli/download.py | 57 +++-- src/scribe_data/cli/get.py | 48 ++-- src/scribe_data/cli/interactive.py | 45 ++-- src/scribe_data/cli/list.py | 20 +- src/scribe_data/cli/main.py | 18 +- src/scribe_data/cli/total.py | 61 +++-- .../unicode/generate_emoji_keywords.py | 13 +- src/scribe_data/unicode/process_unicode.py | 10 +- src/scribe_data/utils.py | 238 +++++++++--------- src/scribe_data/wikidata/check_query/check.py | 65 ++--- .../wikidata/check_query/sparql.py | 26 +- src/scribe_data/wikidata/format_data.py | 14 +- src/scribe_data/wikidata/query_data.py | 32 +-- src/scribe_data/wikidata/wikidata_utils.py | 17 +- src/scribe_data/wikipedia/extract_wiki.py | 86 +++---- src/scribe_data/wikipedia/process_wiki.py | 50 ++-- src/scribe_data/wiktionary/parse_dump.py | 154 +++++++----- src/scribe_data/wiktionary/parse_mediaWiki.py | 54 ++-- tests/cli/test_download.py | 10 +- tests/wikidata/test_check_query.py | 1 + 28 files changed, 725 insertions(+), 664 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 924927ec..f929c112 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -300,13 +300,18 @@ def example_function(argument: argument_type) -> return_type: Parameters ---------- - argument: argument_type - Description of your argument. + argument: argument_type + Description of your argument. Returns ------- - return_value : return_type - Description of your return value. + return_value : return_type + Description of your return value. + + Raises + ------ + ErrorType + Description of the error and the condition that raises it. """ ... diff --git a/src/scribe_data/check/check_project_metadata.py b/src/scribe_data/check/check_project_metadata.py index 84523ba2..159d8ca2 100644 --- a/src/scribe_data/check/check_project_metadata.py +++ b/src/scribe_data/check/check_project_metadata.py @@ -88,16 +88,16 @@ def get_missing_languages( Parameters ---------- - reference_languages : dict - A dictionary of languages from the reference source. + reference_languages : dict + A dictionary of languages from the reference source. - target_languages : dict - A dictionary of languages from the target source to check for missing entries. + target_languages : dict + A dictionary of languages from the target source to check for missing entries. Returns ------- - list[str] - A list of languages and sub-languages that are in target_languages but not in reference_languages. + list[str] + A list of languages and sub-languages that are in target_languages but not in reference_languages. 
""" missing_languages = [] reference_keys = reference_languages.keys() @@ -130,17 +130,17 @@ def validate_language_properties(languages_dict: dict) -> dict: Parameters ---------- - languages_dict : dict - A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key. + languages_dict : dict + A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key. Returns ------- - dict: A dictionary with two lists: - - "missing_qids": Languages or sub-languages missing the 'qid' property. - - "missing_isos": Languages or sub-languages missing the 'iso' property. + dict: A dictionary with two lists: + - "missing_qids": Languages or sub-languages missing the 'qid' property. + - "missing_isos": Languages or sub-languages missing the 'iso' property. - Each entry in these lists is in the format "parent_language - sub_language" for sub-languages, - or simply "parent_language" for the parent languages. + Each entry in these lists is in the format "parent_language - sub_language" for sub-languages, + or simply "parent_language" for the parent languages. """ missing_qids = [] missing_isos = [] diff --git a/src/scribe_data/check/check_project_structure.py b/src/scribe_data/check/check_project_structure.py index 612299df..edfe9085 100644 --- a/src/scribe_data/check/check_project_structure.py +++ b/src/scribe_data/check/check_project_structure.py @@ -48,24 +48,24 @@ def check_for_sparql_files(folder_path, data_type, language, subdir, missing_que Parameters ---------- - folder_path : str - The path to the data-type folder. + folder_path : str + The path to the data-type folder. - data_type : str - The name of the data type being checked. + data_type : str + The name of the data type being checked. - language : str - The name of the language being processed. + language : str + The name of the language being processed. - subdir : str or None - The name of the sub-directory (for languages with sub-dialects), or None. + subdir : str or None + The name of the sub-directory (for languages with sub-dialects), or None. - missing_queries : list - A list to which missing SPARQL query files will be appended. + missing_queries : list + A list to which missing SPARQL query files will be appended. Returns ------- - bool: True if at least one .sparql file is found, False otherwise. + bool: True if at least one .sparql file is found, False otherwise. """ sparql_files = [f for f in os.listdir(folder_path) if f.endswith(".sparql")] diff --git a/src/scribe_data/check/check_pyicu.py b/src/scribe_data/check/check_pyicu.py index a2d645ce..456d4ed8 100644 --- a/src/scribe_data/check/check_pyicu.py +++ b/src/scribe_data/check/check_pyicu.py @@ -27,8 +27,8 @@ from pathlib import Path import pkg_resources +import questionary import requests -from questionary import confirm def check_if_pyicu_installed(): @@ -90,15 +90,15 @@ def download_wheel_file(wheel_url, output_dir): Parameters ---------- - wheel_url : str - The URL of the wheel file to download. + wheel_url : str + The URL of the wheel file to download. - output_dir : str - The directory to save the downloaded file. + output_dir : str + The directory to save the downloaded file. Returns ------- - str : path to the downloaded wheel file. + str : path to the downloaded wheel file. 
""" response = requests.get(wheel_url) response.raise_for_status() # raise an error for bad responses @@ -118,18 +118,18 @@ def find_matching_wheel(wheels, python_version, architecture): Parameters ---------- - wheels : list - The list of available wheels. + wheels : list + The list of available wheels. - python_version : str - The Python version (e.g., 'cp311'). + python_version : str + The Python version (e.g., 'cp311'). - architecture : str - The architecture type (e.g., 'win_amd64'). + architecture : str + The architecture type (e.g., 'win_amd64'). Returns ------- - str : The download URL of the matching wheel or None if not found. + str : The download URL of the matching wheel or None if not found. """ return next( ( @@ -148,8 +148,7 @@ def check_and_install_pyicu(): # Fetch available wheels from GitHub to estimate download size. wheels, total_size_mb = fetch_wheel_releases() - # Use questionary to ask for user confirmation - user_wants_to_proceed = confirm( + user_wants_to_proceed = questionary.confirm( f"{package_name} is not installed.\nScribe-Data can install the package and the needed dependencies." f"\nApproximately {total_size_mb:.2f} MB will be downloaded.\nDo you want to proceed?" ).ask() diff --git a/src/scribe_data/check/check_query_forms.py b/src/scribe_data/check/check_query_forms.py index 12c4d96d..5435c844 100644 --- a/src/scribe_data/check/check_query_forms.py +++ b/src/scribe_data/check/check_query_forms.py @@ -57,18 +57,18 @@ def extract_forms_from_sparql(file_path: Path) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract forms. + file_path : Path + The path to the SPARQL query file from which to extract forms. Returns ------- - query_form_dict : dict - The file path with form labels of the query and their respective QIDs. + query_form_dict : dict + The file path with form labels of the query and their respective QIDs. Raises ------ - FileNotFoundError - If the specified file does not exist. + FileNotFoundError + If the specified file does not exist. """ optional_pattern = r"\s\sOPTIONAL\s*\{([^}]*)\}" try: @@ -95,13 +95,13 @@ def extract_form_rep_label(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - str - The label of the form representation. + str + The label of the form representation. """ onto_rep_pattern = r"ontolex:representation .* ;" if line_match := re.search(pattern=onto_rep_pattern, string=form_text): @@ -119,13 +119,13 @@ def decompose_label_features(label): Parameters ---------- - label : str - The concatenated label string composed of several grammatical features. + label : str + The concatenated label string composed of several grammatical features. Returns ------- - list - A list of grammatical features extracted from the label in their original order. + list + A list of grammatical features extracted from the label in their original order. """ components = re.findall(r"[A-Za-z][^A-Z]*", label) valid_components = [] @@ -157,13 +157,13 @@ def extract_form_qids(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - list[str] - All QIDS that make up the form. + list[str] + All QIDS that make up the form. """ qids_pattern = r"wikibase:grammaticalFeature .+ \." 
if match := re.search(pattern=qids_pattern, string=form_text): @@ -179,13 +179,13 @@ def check_form_label(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - bool - Whether the form and its current representation label match (repForm and rep). + bool + Whether the form and its current representation label match (repForm and rep). """ form_label_line_pattern = r"\?lexeme ontolex:lexicalForm .* \." @@ -221,13 +221,13 @@ def check_query_formatting(form_text: str): Parameters ---------- - query_text : str - The SPARQL query text to check. + query_text : str + The SPARQL query text to check. Returns ------- - bool - Whether there are formatting errors with the query. + bool + Whether there are formatting errors with the query. """ # Check for spaces before commas that should not exist. if re.search(r"\s,", form_text): @@ -249,13 +249,13 @@ def return_correct_form_label(qids: list): Parameters ---------- - qids : list[str] - All QIDS that make up the form. + qids : list[str] + All QIDS that make up the form. Returns ------- - correct_label : str - The label for the representation given the QIDs. + correct_label : str + The label for the representation given the QIDs. """ if not qids: return "Invalid query formatting found" @@ -289,14 +289,14 @@ def validate_forms(query_text: str) -> str: Parameters ---------- - query_file : str - The SPARQL query text as a string. + query_file : str + The SPARQL query text as a string. Returns ------- - str - Error message if there are any issues with the order of variables or forms, - otherwise an empty string. + str + Error message if there are any issues with the order of variables or forms, + otherwise an empty string. """ select_pattern = r"SELECT\s+(.*?)\s+WHERE" @@ -376,13 +376,13 @@ def check_docstring(query_text: str) -> bool: Parameters ---------- - query_text : str - The SPARQL query's text to be checked. + query_text : str + The SPARQL query's text to be checked. Returns ------- - bool - True if the docstring is correctly formatted. + bool + True if the docstring is correctly formatted. """ # Split the text into lines. query_lines = query_text.splitlines(keepends=True) @@ -418,14 +418,14 @@ def check_forms_order(query_text): Parameters ---------- - query_text : str - The SPARQL query text containing the SELECT statement with variables. + query_text : str + The SPARQL query text containing the SELECT statement with variables. Returns ------- - list or bool - A sorted list of variables if the ordering differs from the original, - otherwise a boolean indicating that the order matches. + list or bool + A sorted list of variables if the ordering differs from the original, + otherwise a boolean indicating that the order matches. """ select_pattern = r"SELECT\s+(.*?)\s+WHERE" @@ -496,14 +496,14 @@ def check_optional_qid_order(query_file: str) -> str: Parameters ---------- - query_file : str - The path to the SPARQL query file to be checked. + query_file : str + The path to the SPARQL query file to be checked. Returns ------- - str - A formatted string with details on any order mismatches in the QIDs, or an empty - string if all QIDs are correctly ordered. + str + A formatted string with details on any order mismatches in the QIDs, or an empty + string if all QIDs are correctly ordered. 
""" forms = extract_forms_from_sparql(query_file) error_messages = [] diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index 3d1779e7..5337b972 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -41,21 +41,21 @@ def is_valid_language(query_file: Path, lang_qid: str) -> bool: Parameters ---------- - query_file : Path - The path to the SPARQL query file being validated. + query_file : Path + The path to the SPARQL query file being validated. - lang_qid : str - The QID of the language extracted from the SPARQL query. + lang_qid : str + The QID of the language extracted from the SPARQL query. Returns ------- - bool - True if the language QID is valid, otherwise False. + bool + True if the language QID is valid, otherwise False. - Example - ------- - > is_valid_language(Path("path/to/query.sparql"), "Q123456") - True + Examples + -------- + > is_valid_language(Path("path/to/query.sparql"), "Q123456") + True """ lang_directory_name = query_file.parent.parent.name.lower() language_entry = language_metadata.get(lang_directory_name) @@ -79,21 +79,21 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: Parameters ---------- - query_file : Path - The path to the SPARQL query file being validated. + query_file : Path + The path to the SPARQL query file being validated. - data_type_qid : str - The QID of the data type extracted from the SPARQL query. + data_type_qid : str + The QID of the data type extracted from the SPARQL query. Returns ------- - bool - True if the data type QID is valid, otherwise False. + bool + True if the data type QID is valid, otherwise False. - Example - ------- - > is_valid_data_type(Path("path/to/query.sparql"), "Q654321") - True + Examples + -------- + > is_valid_data_type(Path("path/to/query.sparql"), "Q654321") + True """ directory_name = query_file.parent.name # e.g., "nouns" or "verbs" expected_data_type_qid = data_type_metadata.get(directory_name) @@ -107,21 +107,21 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract the QID. + file_path : Path + The path to the SPARQL query file from which to extract the QID. - pattern : str - The regex pattern used to match the QID (either for language or data type). + pattern : str + The regex pattern used to match the QID (either for language or data type). Returns ------- - str - The extracted QID if found, otherwise None. + str + The extracted QID if found, otherwise None. Raises ------ - FileNotFoundError - If the specified file does not exist. + FileNotFoundError + If the specified file does not exist. """ try: with open(file_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index 5c8cd14b..6fa46651 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -34,12 +34,12 @@ def correct_data_type(data_type: str) -> str: Parameters ---------- - data_type : str - The data type to potentially correct. + data_type : str + The data type to potentially correct. Returns ------- - The data_type value or a corrected version of it. + The data_type value or a corrected version of it. """ all_data_types = data_type_metadata.keys() @@ -122,16 +122,16 @@ def validate_language_and_data_type( Parameters ---------- - language : str or list - The language(s) to validate. 
+ language : str or list + The language(s) to validate. - data_type : str or list - The data type(s) to validate. + data_type : str or list + The data type(s) to validate. Raises ------ - ValueError - If any of the languages or data types is invalid, with all errors reported together. + ValueError + If any of the languages or data types is invalid, with all errors reported together. """ def validate_single_item(item, valid_options, item_type): @@ -140,19 +140,19 @@ def validate_single_item(item, valid_options, item_type): Parameters ---------- - item : str - The item to validate. - valid_options : list + item : str + The item to validate. - A list of valid options against which the item will be validated. + valid_options : list + A list of valid options against which the item will be validated. - item_type : str - A description of the item type (e.g., "language", "data-type") used in error messages. + item_type : str + A description of the item type (e.g., "language", "data-type") used in error messages. Returns ------- - str or None - Returns an error message if the item is invalid, or None if the item is valid. + str or None + Returns an error message if the item is invalid, or None if the item is valid. """ if ( isinstance(item, str) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index a2e2f777..27518244 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -51,30 +51,30 @@ def convert_to_json( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : Union[str, List[str]] - The data type of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. - output_type : str - The output format, should be "json". + output_type : str + The output format, should be "json". - input_file : str - The input CSV/TSV file path. + input_file : str + The input CSV/TSV file path. - output_dir : Path - The output directory path for results. + output_dir : Path + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". Returns ------- - None + None """ if not language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -205,30 +205,30 @@ def convert_to_csv_or_tsv( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : Union[str, List[str]] - The data type of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. - output_type : str - The output format, should be "csv" or "tsv". + output_type : str + The output format, should be "csv" or "tsv". - input_file : str - The input JSON file path. + input_file : str + The input JSON file path. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". 
Returns ------- - None + None """ if not language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -391,30 +391,30 @@ def convert_to_sqlite( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : str - The data type of the file to convert. + data_type : str + The data type of the file to convert. - output_type : str - The output format, should be "sqlite". + output_type : str + The output format, should be "sqlite". - input_file : Path - The input file path for the data to be converted. + input_file : Path + The input file path for the data to be converted. - output_dir : Path - The output directory path for results. + output_dir : Path + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". Returns ------- - A SQLite file saved in the given location. + A SQLite file saved in the given location. """ if input_file: input_file = Path(input_file) @@ -487,7 +487,7 @@ def convert_wrapper( Returns ------- - None + None """ output_type = output_type.lower() diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py index 4ce478e0..f7f29adf 100644 --- a/src/scribe_data/cli/download.py +++ b/src/scribe_data/cli/download.py @@ -27,10 +27,10 @@ from pathlib import Path from typing import Optional +import questionary import requests from rich import print as rprint from tqdm import tqdm -import questionary from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download @@ -46,13 +46,16 @@ def parse_date(date_string): Parameters ---------- - date_string : str - The date string to be parsed. + date_string : str + The date string to be parsed. Returns ------- - datetime.date : Parsed date object if the format is valid. - None : If the date format is invalid. + datetime.date + Parsed date object if the format is valid. + + None + If the date format is invalid. """ formats = ["%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"] for fmt in formats: @@ -76,19 +79,22 @@ def available_closest_lexeme_dumpfile( Parameters ---------- - target_entity : str - The target date for which the dump is requested (format: YYYY/MM/DD or similar). + target_entity : str + The target date for which the dump is requested (format: YYYY/MM/DD or similar). - other_old_dumps : list - List of available dump folders as strings. + other_old_dumps : list + List of available dump folders as strings. - check_wd_dump_exists : function - A function to validate if the dump file exists. + check_wd_dump_exists : function + A function to validate if the dump file exists. Returns ------- - str : The closest available dump file date (as a string). - None : If no suitable dump is found. + str + The closest available dump file date (as a string). + + None + If no suitable dump is found. """ target_date = parse_date(target_entity) closest_date = None @@ -122,16 +128,19 @@ def download_wd_lexeme_dump(target_entity: str = "latest-lexemes"): Parameters ---------- - target_entity : str, optional - The target dump to download. Defaults to "latest-lexemes". + target_entity : str, optional + The target dump to download. Defaults to "latest-lexemes". - - If "latest-lexemes", downloads the latest dump. 
- - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date. + - If "latest-lexemes", downloads the latest dump. + - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date. Returns ------- - str : The URL of the requested or closest available dump. - None : If no suitable dump is found or the request fails. + str + The URL of the requested or closest available dump. + + None + If no suitable dump is found or the request fails. """ base_url = "https://dumps.wikimedia.org/wikidatawiki/entities" @@ -219,12 +228,12 @@ def wd_lexeme_dump_download_wrapper( Parameters ---------- - wikidata_dump : str - Optional date string in YYYYMMDD format for specific dumps. + wikidata_dump : str + Optional date string in YYYYMMDD format for specific dumps. - output_dir : str - Optional directory path for the downloaded file. - Defaults to 'scribe_data_wikidata_dumps_export' directory. + output_dir : str + Optional directory path for the downloaded file. + Defaults to 'scribe_data_wikidata_dumps_export' directory. """ dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes") diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index a7c92afb..3e2f3897 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -24,8 +24,8 @@ from pathlib import Path from typing import List, Union -from rich import print as rprint import questionary +from rich import print as rprint from scribe_data.cli.convert import convert_wrapper from scribe_data.unicode.generate_emoji_keywords import generate_emoji @@ -56,39 +56,39 @@ def get_data( Parameters ---------- - language : str - The language(s) to get. + language : str + The language(s) to get. - data_type : str - The data type(s) to get. + data_type : str + The data type(s) to get. - output_type : str - The output file type. + output_type : str + The output file type. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - outputs_per_entry : str - How many outputs should be generated per data entry. + outputs_per_entry : str + How many outputs should be generated per data entry. - overwrite : bool (default: False) - Whether to overwrite existing files. + overwrite : bool (default: False) + Whether to overwrite existing files. - all_bool : bool - Get all languages and data types. + all_bool : bool + Get all languages and data types. - interactive : bool (default: False) - Whether it's running in interactive mode. + interactive : bool (default: False) + Whether it's running in interactive mode. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". - wikidata_dump : str - The local Wikidata dump that can be used to process data. + wikidata_dump : str + The local Wikidata lexeme dump that can be used to process data. Returns ------- - The requested data saved locally given file type and location arguments. + The requested data saved locally given file type and location arguments. 
""" # MARK: Defaults @@ -159,7 +159,7 @@ def prompt_user_download_all(): else: print("Updating all languages and data types...") rprint( - "[bold red]Note that the download all functionality must use Wikidata dumps to observe responsible Wikidata Query Service usage practices.[/bold red]" + "[bold red]Note that the download all functionality must use Wikidata lexeme dumps to observe responsible Wikidata Query Service usage practices.[/bold red]" ) parse_wd_lexeme_dump( language="all", @@ -187,7 +187,7 @@ def prompt_user_download_all(): ) return - # MARK: Query Data using Wikidata Dump + # MARK: Form Dump elif wikidata_dump: parse_wd_lexeme_dump( diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index f48b952f..5e5dec74 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -27,7 +27,6 @@ import questionary from prompt_toolkit import prompt from prompt_toolkit.completion import WordCompleter -from questionary import Choice from rich import print as rprint from rich.console import Console from rich.logging import RichHandler @@ -37,14 +36,14 @@ # from scribe_data.cli.list import list_wrapper from scribe_data.cli.get import get_data from scribe_data.cli.total import total_wrapper -from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, DEFAULT_DUMP_EXPORT_DIR, + DEFAULT_JSON_EXPORT_DIR, data_type_metadata, language_metadata, list_all_languages, ) +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump # MARK: Config Setup @@ -261,10 +260,12 @@ def request_total_lexeme_loop(): choice = questionary.select( "What would you like to do?", choices=[ - Choice("Configure total lexemes request", "total"), - Choice("Run total lexemes request", "run"), - Choice("Run total lexemes request with lexeme dumps", "run_all"), - Choice("Exit", "exit"), + questionary.Choice("Configure total lexemes request", "total"), + questionary.Choice("Run total lexemes request", "run"), + questionary.Choice( + "Run total lexemes request with lexeme dumps", "run_all" + ), + questionary.Choice("Exit", "exit"), ], ).ask() @@ -303,7 +304,7 @@ def request_total_lexeme_loop(): # See list of languages. # """ -# choice = questionary.select( +# choice = select( # "What would you like to list?", # choices=[ # Choice("All languages", "all_languages"), @@ -327,42 +328,46 @@ def start_interactive_mode(operation: str = None): Parameters ---------- - operation : str - The type of operation that interactive mode is being ran with. + operation : str + The type of operation that interactive mode is being ran with. """ while True: # Check if both selected_languages and selected_data_types are empty. 
if not config.selected_languages and not config.selected_data_types: if operation == "get": choices = [ - Choice("Configure get data request", "configure"), + questionary.Choice("Configure get data request", "configure"), # Choice("See list of languages", "languages"), - Choice("Exit", "exit"), + questionary.Choice("Exit", "exit"), ] elif operation == "total": choices = [ - Choice("Configure total lexemes request", "total"), + questionary.Choice("Configure total lexemes request", "total"), # Choice("See list of languages", "languages"), - Choice("Exit", "exit"), + questionary.Choice("Exit", "exit"), ] elif operation == "translations": choices = [ - Choice("Configure translations request", "translations"), + questionary.Choice( + "Configure translations request", "translations" + ), # Choice("See list of languages", "languages"), - Choice("Exit", "exit"), + questionary.Choice("Exit", "exit"), ] else: choices = [ - Choice("Configure get data request", "configure"), - Choice("Exit", "exit"), + questionary.Choice("Configure get data request", "configure"), + questionary.Choice("Exit", "exit"), ] if config.configured: - choices.insert(1, Choice("Request for get data", "run")) + choices.insert(1, questionary.Choice("Request for get data", "run")) else: - choices.insert(1, Choice("Request for total lexeme", "total")) + choices.insert( + 1, questionary.Choice("Request for total lexeme", "total") + ) choice = questionary.select("What would you like to do?", choices=choices).ask() diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index a2aebfce..72175879 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -70,8 +70,8 @@ def list_data_types(language: str = None) -> None: Parameters ---------- - language : str - The language to potentially list data types for. + language : str + The language to potentially list data types for. """ languages = list_all_languages(language_metadata) if language: @@ -142,8 +142,8 @@ def list_languages_for_data_type(data_type: str) -> None: Parameters ---------- - data_type : str - The data type to check for. + data_type : str + The data type to check for. """ data_type = correct_data_type(data_type=data_type) all_languages = list_languages_with_metadata_for_data_type(language_metadata) @@ -179,14 +179,14 @@ def list_wrapper( Parameters ---------- - language : str - The language to potentially list data types for. + language : str + The language to potentially list data types for. - data_type : str - The data type to check for. + data_type : str + The data type to check for. - all_bool : boolean - Whether all languages and data types should be listed. + all_bool : boolean + Whether all languages and data types should be listed. 
""" if (not language and not data_type) or all_bool: list_all() diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 8baa7873..e22f4aea 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -24,8 +24,8 @@ import argparse from pathlib import Path -from rich import print as rprint from questionary import select +from rich import print as rprint from scribe_data.cli.cli_utils import validate_language_and_data_type from scribe_data.cli.convert import convert_wrapper @@ -36,7 +36,6 @@ from scribe_data.cli.total import total_wrapper from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message - from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." @@ -291,8 +290,8 @@ def main() -> None: download_parser = subparsers.add_parser( "download", aliases=["d"], - help="Download Wikidata dumps.", - description="Download Wikidata dumps from dumps.wikimedia.org.", + help="Download Wikidata lexeme dumps.", + description="Download Wikidata lexeme dumps from dumps.wikimedia.org.", epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) @@ -302,7 +301,7 @@ def main() -> None: "--wikidata-dump-version", nargs="?", const="latest", - help="Download Wikidata dump. Optionally specify date in YYYYMMDD format.", + help="Download Wikidata lexeme dump. Optionally specify date in YYYYMMDD format.", ) download_parser.add_argument( "-wdp", @@ -428,7 +427,7 @@ def main() -> None: action = select( "What would you like to do?", choices=[ - "Download a Wikidata dump", + "Download a Wikidata lexemes dump", "Check for totals", "Get data", "Get translations", @@ -436,16 +435,21 @@ def main() -> None: ], ).ask() - if action == "Download a Wikidata dump": + if action == "Download a Wikidata lexemes dump": wd_lexeme_dump_download_wrapper() + elif action == "Check for totals": start_interactive_mode(operation="total") + elif action == "Get data": start_interactive_mode(operation="get") + elif action == "Get translations": start_interactive_mode(operation="translations") + else: print("Skipping action") + else: parser.print_help() diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index e543256e..8d86d7fe 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -35,8 +35,7 @@ language_to_qid, list_all_languages, ) -from scribe_data.wikidata.wikidata_utils import sparql -from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump, sparql def get_qid_by_input(input_str): @@ -45,13 +44,13 @@ def get_qid_by_input(input_str): Parameters ---------- - input_str : str - The input string representing a language or data type. + input_str : str + The input string representing a language or data type. Returns ------- - str or None - The QID corresponding to the input string, or- None if not found. + str or None + The QID corresponding to the input string, or- None if not found. """ if input_str: if input_str in language_to_qid: @@ -69,13 +68,13 @@ def get_datatype_list(language): Parameters ---------- - language : str - The language to return data types for. + language : str + The language to return data types for. Returns ------- - data_types : list[str] or None - A list of the corresponding data types. 
+ data_types : list[str] or None + A list of the corresponding data types. """ language_key = language.strip().lower() # normalize input languages = list_all_languages(language_metadata) @@ -129,18 +128,18 @@ def check_qid_is_language(qid: str): """ Parameters ---------- - qid : str - The QID to check Wikidata to see if it's a language and return its English label. + qid : str + The QID to check Wikidata to see if it's a language and return its English label. Outputs ------- - str - The English label of the Wikidata language entity. + str + The English label of the Wikidata language entity. Raises ------ - ValueError - An invalid QID that's not a language has been passed. + ValueError + An invalid QID that's not a language has been passed. """ api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0" request_string = f"{api_endpoint}/entities/items/{qid}" @@ -167,13 +166,13 @@ def print_total_lexemes(language: str = None): Parameters ---------- - language : str (Default=None) - The language to display data type entity counts for. + language : str (Default=None) + The language to display data type entity counts for. Outputs ------- - str - A formatted string indicating the language, data type, and total number of lexemes for all the languages, if found. + str + A formatted string indicating the language, data type, and total number of lexemes for all the languages, if found. """ if language is None: print("Returning total counts for all languages and data types...\n") @@ -379,24 +378,24 @@ def total_wrapper( Parameters ---------- - language : Union[str, List[str]] - The language(s) to potentially total data types for. + language : Union[str, List[str]] + The language(s) to potentially total data types for. - data_type : Union[str, List[str]] - The data type(s) to check for. + data_type : Union[str, List[str]] + The data type(s) to check for. - all_bool : boolean - Whether all languages and data types should be listed. + all_bool : boolean + Whether all languages and data types should be listed. - wikidata_dump : Union[str, bool] - The local Wikidata dump path that can be used to process data. - If True, indicates the flag was used without a path. + wikidata_dump : Union[str, bool] + The local Wikidata lexeme dump path that can be used to process data. + If True, indicates the flag was used without a path. """ # Handle --all flag if all_bool and wikidata_dump: language = "all" - if wikidata_dump is True: # flag without a wikidata dump path + if wikidata_dump is True: # flag without a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, wikidata_dump_type=["total"], @@ -404,7 +403,7 @@ def total_wrapper( ) return - if isinstance(wikidata_dump, str): # if user provided a wikidata dump path + if isinstance(wikidata_dump, str): # if user provided a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, wikidata_dump_type=["total"], diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index 2661f48d..1d33b158 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -44,16 +44,17 @@ def generate_emoji(language, output_dir: str = None): Parameters ---------- - language : str - The ISO code of the language for which to generate emoji keywords. + language : str + The ISO code of the language for which to generate emoji keywords. - output_dir : str, optional - The directory where the generated data will be saved. 
- If not specified, the data will be saved in a default directory. + output_dir : str, optional + The directory where the generated data will be saved. + If not specified, the data will be saved in a default directory. Returns ------- - None: The function does not return any value but outputs data to the specified directory. + None + The function does not return any value but outputs data to the specified directory. """ if check_and_install_pyicu() and check_if_pyicu_installed() is False: print("Thank you.") diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index abdf2363..eb373862 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -57,15 +57,15 @@ def gen_emoji_lexicon( Parameters ---------- - language : string (default=None) - The language keywords are being generated for. + language : string (default=None) + The language keywords are being generated for. - emojis_per_keyword : int (default=None) - The limit for number of emoji keywords that should be generated per keyword. + emojis_per_keyword : int (default=None) + The limit for number of emoji keywords that should be generated per keyword. Returns ------- - Keywords dictionary for emoji keywords-to-unicode are saved locally or uploaded to Scribe apps. + Keywords dictionary for emoji keywords-to-unicode are saved locally or uploaded to Scribe apps. """ if not icu_installed: raise ImportError("Could not import required PyICU functionality.") diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 163af4ae..311478bc 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -22,17 +22,17 @@ """ import ast +import contextlib import json import os import re -import questionary from datetime import datetime from importlib import resources from pathlib import Path from typing import Any, Optional +import questionary from rich import print as rprint -from questionary import select # MARK: Utils Variables @@ -86,7 +86,6 @@ for lang, lang_data in language_metadata.items(): if "sub_languages" in lang_data: for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): - sub_lang_lower = sub_lang sub_qid = sub_lang_data.get("qid") if sub_qid is None: @@ -95,8 +94,8 @@ ) else: - language_map[sub_lang_lower] = sub_lang_data - language_to_qid[sub_lang_lower] = sub_qid + language_map[sub_lang] = sub_lang_data + language_to_qid[sub_lang] = sub_qid else: qid = lang_data.get("qid") @@ -114,15 +113,15 @@ def _load_json(package_path: str, file_name: str) -> Any: Parameters ---------- - package_path : str - The fully qualified package that contains the resource. + package_path : str + The fully qualified package that contains the resource. - file_name : str - The name of the file (resource) that contains the JSON data. + file_name : str + The name of the file (resource) that contains the JSON data. Returns ------- - A python entity representing the JSON content. + A python entity representing the JSON content. """ with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" @@ -143,25 +142,26 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - Parameters ---------- - source_value : str - The source value to find equivalents for (e.g., 'english', 'nynorsk'). + source_value : str + The source value to find equivalents for (e.g., 'english', 'nynorsk'). - source_key : str - The source key to reference (e.g., 'language'). 
+ source_key : str + The source key to reference (e.g., 'language'). - target_key : str - The key to target (e.g., 'qid'). + target_key : str + The key to target (e.g., 'qid'). - error_msg : str - The message displayed when a value cannot be found. + error_msg : str + The message displayed when a value cannot be found. Returns ------- - The 'target' value given the passed arguments. + The 'target' value given the passed arguments. Raises ------ - ValueError : when a source_value is not supported or the language only has sub-languages. + ValueError + When a source_value is not supported or the language only has sub-languages. """ # Check if we're searching by language name. if source_key == "language": @@ -195,13 +195,13 @@ def get_language_qid(language: str) -> str: Parameters ---------- - language : str - The language the QID should be returned for. + language : str + The language the QID should be returned for. Returns ------- - str - The Wikidata QID for the language. + str + The Wikidata QID for the language. """ return _find( source_key="language", @@ -217,13 +217,13 @@ def get_language_iso(language: str) -> str: Parameters ---------- - language : str - The language the ISO should be returned for. + language : str + The language the ISO should be returned for. Returns ------- - str - The ISO code for the language. + str + The ISO code for the language. """ return _find( @@ -240,13 +240,13 @@ def get_language_from_iso(iso: str) -> str: Parameters ---------- - iso : str - The ISO the language name should be returned for. + iso : str + The ISO the language name should be returned for. Returns ------- - str - The name for the language which has an ISO value of iso. + str + The name for the language which has an ISO value of iso. """ # Iterate over the languages and their properties. for language, properties in _languages.items(): @@ -272,19 +272,19 @@ def load_queried_data( Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - tuple(Any, str) - A tuple containing the loaded data and the path to the data file. + tuple(Any, str) + A tuple containing the loaded data and the path to the data file. """ data_path = ( Path(dir_path) / language.lower().replace(" ", "_") / f"{data_type}.json" @@ -300,18 +300,18 @@ def remove_queried_data(dir_path: str, language: str, data_type: str) -> None: Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - None : The file is deleted. + None : The file is deleted. 
""" data_path = ( Path(dir_path) @@ -319,12 +319,9 @@ def remove_queried_data(dir_path: str, language: str, data_type: str) -> None: / f"{data_type}_queried.json" ) - try: + with contextlib.suppress(OSError): os.remove(data_path) - except OSError: - pass - def export_formatted_data( dir_path: str, @@ -338,21 +335,21 @@ def export_formatted_data( Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - formatted_data : dict - The data to be exported. + formatted_data : dict + The data to be exported. - language : str - The language for which the data is being exported. + language : str + The language for which the data is being exported. - data_type : str - The type of data being exported (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being exported (e.g. 'nouns', 'verbs'). Returns ------- - None + None """ export_path = ( Path(dir_path) @@ -375,13 +372,13 @@ def get_ios_data_path(language: str) -> str: Parameters ---------- - language : str - The language the path should be returned for. + language : str + The language the path should be returned for. Returns ------- - str - The path to the language folder for the given language. + str + The path to the language folder for the given language. """ return Path("Scribe-iOS") / "Keyboards" / "LanguageKeyboards" / f"{language}" @@ -392,13 +389,13 @@ def get_android_data_path() -> str: Parameters ---------- - language : str - The language the path should be returned for. + language : str + The language the path should be returned for. Returns ------- - str - The path to the assets data folder for the application. + str + The path to the assets data folder for the application. """ return Path("Scribe-Android") / "app" / "src" / "main" / "assets" / "data" @@ -411,19 +408,19 @@ def check_command_line_args( Parameters ---------- - file_name : str - The name of the file for clear error outputs if necessary. + file_name : str + The name of the file for clear error outputs if necessary. - passed_values : UNKNOWN (will be checked) - An argument to be checked against known values. + passed_values : UNKNOWN (will be checked) + An argument to be checked against known values. - values_to_check : list(str) - The values that should be checked against. + values_to_check : list(str) + The values that should be checked against. Returns ------- - args: list(str) - The arguments or an error are returned depending on if they're correct. + args: list(str) + The arguments or an error are returned depending on if they're correct. """ try: args = ast.literal_eval(passed_values) @@ -466,19 +463,19 @@ def check_and_return_command_line_args( Parameters ---------- - all_args : list[str] - The arguments passed to the Scribe-Data file. + all_args : list[str] + The arguments passed to the Scribe-Data file. - first_args_check : list[str] - The values that the first argument should be checked against. + first_args_check : list[str] + The values that the first argument should be checked against. - second_args_check : list[str] - The values that the second argument should be checked against. + second_args_check : list[str] + The values that the second argument should be checked against. Returns ------- - first_args, second_args: Tuple[Optional[list[str]], Optional[list[str]]] - The subset of possible first and second arguments that have been verified as being valid. 
+ first_args, second_args: Tuple[Optional[list[str]], Optional[list[str]]] + The subset of possible first and second arguments that have been verified as being valid. """ if len(all_args) == 1: return None, None @@ -523,29 +520,30 @@ def format_sublanguage_name(lang, language_metadata=_languages): Parameters ---------- - lang : str - The name of the language or sub-language to format. + lang : str + The name of the language or sub-language to format. - language_metadata : dict - The metadata containing information about main languages and their sub-languages. + language_metadata : dict + The metadata containing information about main languages and their sub-languages. Returns ------- - str - The formatted language name if it's a sub-language (e.g., 'Nynorsk Norwegian'). - Otherwise the original name. + str + The formatted language name if it's a sub-language (e.g., 'Nynorsk Norwegian'). + Otherwise the original name. Raises ------ - ValueError: If the provided language or sub-language is not found. + ValueError + If the provided language or sub-language is not found. - Example - ------- - > format_sublanguage_name("nynorsk", language_metadata) - 'Nynorsk Norwegian' + Examples + -------- + > format_sublanguage_name("nynorsk", language_metadata) + 'Nynorsk Norwegian' - > format_sublanguage_name("english", language_metadata) - 'English' + > format_sublanguage_name("english", language_metadata) + 'English' """ for main_lang, lang_data in language_metadata.items(): # If it's not a sub-language, return the original name. @@ -598,14 +596,15 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages): # Check if there are sub-languages. if "sub_languages" in lang_data: # Add the sub-languages to current_languages with metadata. - for sub_key, sub_data in lang_data["sub_languages"].items(): - current_languages.append( - { - "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}", - "iso": sub_data.get("iso", ""), - "qid": sub_data.get("qid", ""), - } - ) + current_languages.extend( + { + "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}", + "iso": sub_data.get("iso", ""), + "qid": sub_data.get("qid", ""), + } + for sub_key, sub_data in lang_data["sub_languages"].items() + ) + else: # If no sub-languages, add the main language with metadata. current_languages.append( @@ -638,12 +637,12 @@ def check_lexeme_dump_prompt_download(output_dir: str): Parameters ---------- - output_dir : str - The directory to check for the existence of a Wikidata lexeme dump. + output_dir : str + The directory to check for the existence of a Wikidata lexeme dump. Returns ------- - None : The user is prompted to download a new Wikidata dump after the existence of one is checked. + None : The user is prompted to download a new Wikidata lexeme dump after the existence of one is checked. 
""" existing_dumps = list(Path(output_dir).glob("*.json.bz2")) if existing_dumps: @@ -651,7 +650,7 @@ def check_lexeme_dump_prompt_download(output_dir: str): for dump in existing_dumps: rprint(f" - {Path(output_dir)}/{dump.name}") - user_input = select( + user_input = questionary.select( "Do you want to:", choices=[ "Delete existing dumps", @@ -661,17 +660,17 @@ def check_lexeme_dump_prompt_download(output_dir: str): ], ).ask() - if user_input.startswith("Delete"): + if user_input == "Delete existing dumps": for dump in existing_dumps: dump.unlink() rprint("[bold green]Existing dumps deleted.[/bold green]") - download_input = select( + download_input = questionary.select( "Do you want to download the latest lexeme dump?", choices=["Yes", "No"] ).ask() return download_input != "Yes" - elif user_input.startswith("Use"): + elif user_input == "Use existing latest dump": # Check for the latest dump file. latest_dump = None if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps): @@ -712,9 +711,13 @@ def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool: Returns True if user chooses to skip (i.e., we do NOT proceed). Returns False if the file doesn't exist or user chooses to overwrite (i.e., we DO proceed). - Parameters: - index_path: Path to check - overwrite_all: If True, automatically overwrite without prompting + Parameters + ---------- + index_path : pathlib.Path + The path to check. + + overwrite_all : cool (default=False) + If True, automatically overwrite without prompting. """ if index_path.exists(): if overwrite_all: @@ -727,6 +730,7 @@ def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool: default="Skip process", ).ask() - # If user selects "Skip process", return True meaning "don't proceed" + # If user selects "Skip process", return True meaning "don't proceed". return choice == "Skip process" + return False diff --git a/src/scribe_data/wikidata/check_query/check.py b/src/scribe_data/wikidata/check_query/check.py index 41f1706a..955168b5 100644 --- a/src/scribe_data/wikidata/check_query/check.py +++ b/src/scribe_data/wikidata/check_query/check.py @@ -49,15 +49,15 @@ def ping(url: str, timeout: int) -> bool: Parameters ---------- - url : str - The URL to test. + url : str + The URL to test. - timeout : int - The maximum number of seconds to wait for a reply. + timeout : int + The maximum number of seconds to wait for a reply. Returns ------- - bool : True if connectivity is established or False otherwise. + bool : True if connectivity is established or False otherwise. """ try: with urllib.request.urlopen(url, timeout=timeout) as response: @@ -132,12 +132,12 @@ def check_sparql_file(fpath: str) -> Path: Parameters ---------- - fpath : str - The file to validate. + fpath : str + The file to validate. Returns ------- - Path : the validated file. + Path : the validated file. """ path = Path(fpath) @@ -156,19 +156,20 @@ def check_positive_int(value: str, err_msg: str) -> int: Parameters ---------- - value : str - The value to be validated. + value : str + The value to be validated. - err_msg : str - Used when value fails validation. + err_msg : str + Used when value fails validation. Returns ------- - int : the validated number. + int + The validated number. Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ with contextlib.suppress(ValueError): number = int(value) @@ -184,16 +185,17 @@ def check_limit(limit: str) -> int: Parameters ---------- - limit : str - The LIMIT to be validated. 
+ limit : str + The LIMIT to be validated. Returns ------- - int : the validated LIMIT. + int + The validated LIMIT. Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ return check_positive_int(limit, "LIMIT must be an integer of value 1 or greater.") @@ -204,16 +206,17 @@ def check_timeout(timeout: str) -> int: Parameters ---------- - timeout : str - The timeout to be validated. + timeout : str + The timeout to be validated. Returns ------- - int : the validated timeout. + int + The validated timeout. Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ return check_positive_int( timeout, "timeout must be an integer of value 1 or greater." @@ -226,12 +229,13 @@ def main(argv=None) -> int: Parameters ---------- - argv (default=None) - If set to None then argparse will use sys.argv as the arguments. + argv (default=None) + If set to None then argparse will use sys.argv as the arguments. Returns -------- - int : the exit status - 0 - success; any other value - failure. + int + The exit status - 0 - success; any other value - failure. """ cli = argparse.ArgumentParser( description=f"run SPARQL queries from the '{PROJECT_ROOT}' project", @@ -356,7 +360,8 @@ def error_report(failures: list[QueryExecutionException]) -> None: Parameters ---------- - failures (list[QueryExecutionException]) : failed queries. + failures : list[QueryExecutionException] + Failed queries. """ if not failures: return @@ -373,11 +378,11 @@ def success_report(successes: list[tuple[QueryFile, dict]], display: bool) -> No Parameters ---------- - successes : list[tuple[QueryFile, dict]] - Successful queries. + successes : list[tuple[QueryFile, dict]] + Successful queries. - display : bool - Whether there should be an output or not. + display : bool + Whether there should be an output or not. """ if not (display and successes): return diff --git a/src/scribe_data/wikidata/check_query/sparql.py b/src/scribe_data/wikidata/check_query/sparql.py index f702907f..b3c43d27 100644 --- a/src/scribe_data/wikidata/check_query/sparql.py +++ b/src/scribe_data/wikidata/check_query/sparql.py @@ -39,12 +39,13 @@ def sparql_context(url: str) -> SPARQL.SPARQLWrapper: Parameters ---------- - url : str - A valid URL of a SPARQL endpoint. + url : str + A valid URL of a SPARQL endpoint. Returns ------- - SPARQLWrapper : the context. + SPARQLWrapper + The context. """ context = SPARQL.SPARQLWrapper(url) context.setReturnFormat(SPARQL.JSON) @@ -61,21 +62,22 @@ def execute( Parameters ---------- - query : QueryFile - The SPARQL query to run. + query : QueryFile + The SPARQL query to run. - limit : int - The maximum number of results a query should return. + limit : int + The maximum number of results a query should return. - context : SPARQLWrapper - The SPARQL context. + context : SPARQLWrapper + The SPARQL context. - tries : int - The maximum number of times the query should be executed after failure. + tries : int + The maximum number of times the query should be executed after failure. Returns ------- - dict : the results of the query. + dict + The results of the query. """ def delay_in_seconds() -> int: diff --git a/src/scribe_data/wikidata/format_data.py b/src/scribe_data/wikidata/format_data.py index 68186dbe..2aa2db97 100644 --- a/src/scribe_data/wikidata/format_data.py +++ b/src/scribe_data/wikidata/format_data.py @@ -46,18 +46,18 @@ def format_data( Parameters ---------- - dir_path : str - The output directory path for results. + dir_path : str + The output directory path for results. 
- language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns _______ - A saved and formatted data file for the given language and data type. + A saved and formatted data file for the given language and data type. """ data_list, data_path = load_queried_data( dir_path=dir_path, language=language, data_type=data_type diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index e23be51e..bbe7c7b5 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -47,18 +47,18 @@ def execute_formatting_script(output_dir: str, language: str, data_type: str): Parameters ---------- - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - The results of the formatting script saved in the given output directory. + The results of the formatting script saved in the given output directory. """ formatting_file_path = Path(__file__).parent / "format_data.py" @@ -108,21 +108,21 @@ def query_data( Parameters ---------- - language : str - The language(s) to get. + language : str + The language(s) to get. - data_type : str - The data type(s) to get. + data_type : str + The data type(s) to get. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - overwrite : bool (default: False) - Whether to overwrite existing files. + overwrite : bool (default: False) + Whether to overwrite existing files. Returns ------- - Formatted data from Wikidata saved in the output directory. + Formatted data from Wikidata saved in the output directory. """ current_languages = list_all_languages(language_metadata) current_data_type = ["nouns", "verbs", "prepositions"] diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 14e5fc02..29182070 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -21,14 +21,15 @@ """ from pathlib import Path -from rich import print as rprint -from SPARQLWrapper import JSON, POST, SPARQLWrapper from typing import List, Union + import requests +from rich import print as rprint +from SPARQLWrapper import JSON, POST, SPARQLWrapper from scribe_data.cli.download import wd_lexeme_dump_download_wrapper +from scribe_data.utils import data_type_metadata, language_metadata from scribe_data.wiktionary.parse_dump import parse_dump -from scribe_data.utils import language_metadata, data_type_metadata sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) @@ -65,20 +66,24 @@ def parse_wd_lexeme_dump( wikidata_dump_path: str = None, ): """ - Checks for the existence of a Wikidata dump and parses it if possible. + Checks for the existence of a Wikidata lexeme dump and parses it if possible. Parameters ---------- language : Union[str, List[str]] The language(s) to parse the data for. Use "all" for all languages. 
+ wikidata_dump_type : List[str] - The type(s) of Wikidata dump to parse (e.g. ["total", "translations", "form"]). + The type(s) of Wikidata lexeme dump to parse (e.g. ["total", "translations", "form"]). + data_types : List[str] The categories to parse when using "form" type (e.g. ["nouns", "adverbs"]). + type_output_dir : str, optional The directory to save the parsed JSON data. If None, uses default directory. + wikidata_dump_path : str, optional - The local Wikidata dump directory that should be used to get data. + The local Wikidata lexeme dump directory that should be used to get data. """ # Convert "all" to list of all languages if isinstance(language, str) and language.lower() == "all": diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py index 37482bee..c4b8b450 100644 --- a/src/scribe_data/wikipedia/extract_wiki.py +++ b/src/scribe_data/wikipedia/extract_wiki.py @@ -47,24 +47,24 @@ def download_wiki(language="en", target_dir="wiki_dump", file_limit=None, dump_i Parameters ---------- - language : str (default=en) - The language of Wikipedia to download. + language : str (default=en) + The language of Wikipedia to download. - target_dir : pathlib.Path (default=wiki_dump) - The directory in the pwd into which files should be downloaded. + target_dir : pathlib.Path (default=wiki_dump) + The directory in the pwd into which files should be downloaded. - file_limit : int (default=None, all files) - The limit for the number of files to download. + file_limit : int (default=None, all files) + The limit for the number of files to download. - dump_id : str (default=None) - The id of an explicit Wikipedia dump that the user wants to download. + dump_id : str (default=None) + The id of an explicit Wikipedia dump that the user wants to download. - Note: a value of None will select the third from the last (latest stable dump). + Note: a value of None will select the third from the last (latest stable dump). Returns ------- - file_info : list of lists - Information on the downloaded Wikipedia dump files. + file_info : list of lists + Information on the downloaded Wikipedia dump files. """ if file_limit is not None: assert isinstance( @@ -148,16 +148,16 @@ def _process_article(title, text): Parameters ---------- - title : str - The title of the article. + title : str + The title of the article. - text : str - The text to be processed. + text : str + The text to be processed. Returns ------- - title, text: string, string - The data from the article. + title, text: string, string + The data from the article. """ wikicode = mwparserfromhell.parse(text) @@ -173,24 +173,24 @@ def iterate_and_parse_file(args): Parameters ---------- - args : tuple - The below arguments as a tuple for pool.imap_unordered rather than pool.starmap. + args : tuple + The below arguments as a tuple for pool.imap_unordered rather than pool.starmap. - input_path : pathlib.Path - The path to the data file. + input_path : pathlib.Path + The path to the data file. - partitions_dir : pathlib.Path - The path to where output file should be stored. + partitions_dir : pathlib.Path + The path to where output file should be stored. - article_limit : int (default=None) - An optional article_limit of the number of articles to find. + article_limit : int (default=None) + An optional article_limit of the number of articles to find. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the processes. 
+ verbose : bool (default=True) + Whether to show a tqdm progress bar for the processes. Returns ------- - A parsed file Wikipedia dump file with articles. + A parsed file Wikipedia dump file with articles. """ input_path, partitions_dir, article_limit, verbose = args @@ -296,30 +296,30 @@ def parse_to_ndjson( Parameters ---------- - output_path : str (default=articles) - The name of the final output ndjson file. + output_path : str (default=articles) + The name of the final output ndjson file. - input_dir : str (default=wikipedia_dump) - The path to the directory where the data is stored. + input_dir : str (default=wikipedia_dump) + The path to the directory where the data is stored. - partitions_dir : str (default=partitions) - The path to the directory where the output should be stored. + partitions_dir : str (default=partitions) + The path to the directory where the output should be stored. - article_limit : int (default=None) - An optional limit of the number of articles per dump file to find. + article_limit : int (default=None) + An optional limit of the number of articles per dump file to find. - delete_parsed_files : bool (default=False) - Whether to delete the separate parsed files after combining them. + delete_parsed_files : bool (default=False) + Whether to delete the separate parsed files after combining them. - multicore : bool (default=True) - Whether to use multicore processing. + multicore : bool (default=True) + Whether to use multicore processing. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the processes. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the processes. Returns ------- - Wikipedia dump files parsed and converted to json files. + Wikipedia dump files parsed and converted to json files. """ output_dir = "/".join(list(output_path.split("/")[:-1])) if not output_dir.exists(): diff --git a/src/scribe_data/wikipedia/process_wiki.py b/src/scribe_data/wikipedia/process_wiki.py index bd5bbb16..458e5dda 100644 --- a/src/scribe_data/wikipedia/process_wiki.py +++ b/src/scribe_data/wikipedia/process_wiki.py @@ -54,25 +54,25 @@ def clean( Parameters ---------- - texts : str or list - The texts to be cleaned and tokenized. + texts : str or list + The texts to be cleaned and tokenized. - language : string (default=en) - The language of the texts being cleaned. + language : string (default=en) + The language of the texts being cleaned. - remove_words : str or list (default=None) - Strings that should be removed from the text body. + remove_words : str or list (default=None) + Strings that should be removed from the text body. - sample_size : float (default=1) - The amount of data to be randomly sampled. + sample_size : float (default=1) + The amount of data to be randomly sampled. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the process. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the process. Returns ------- - cleaned_texts : list - The texts formatted for analysis. + cleaned_texts : list + The texts formatted for analysis. """ if isinstance(texts, str): texts = [texts] @@ -331,27 +331,27 @@ def gen_autosuggestions( Parameters ---------- - text_corpus : list - The Wikipedia texts formatted for word relation extraction. + text_corpus : list + The Wikipedia texts formatted for word relation extraction. - language : string (default=en) - The language autosuggestions are being generated for. 
+ language : string (default=en) + The language autosuggestions are being generated for. - num_words: int (default=500) - The number of words that autosuggestions should be generated for. + num_words: int (default=500) + The number of words that autosuggestions should be generated for. - ignore_words : str or list (default=None) - Strings that should be removed from the text body. + ignore_words : str or list (default=None) + Strings that should be removed from the text body. - update_local_data : bool (default=False) - Saves the created dictionaries as JSONs in the target directories. + update_local_data : bool (default=False) + Saves the created dictionaries as JSONs in the target directories. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the process. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the process. Returns ------- - Autosuggestions dictionaries for common words are saved locally or uploaded to Scribe apps. + Autosuggestions dictionaries for common words are saved locally or uploaded to Scribe apps. """ counter_obj = Counter(chain.from_iterable(text_corpus)) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 36bbbc69..45f00d19 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -22,19 +22,19 @@ import bz2 import time -import orjson - -from tqdm import tqdm +from collections import Counter, defaultdict from pathlib import Path -from collections import defaultdict, Counter -from typing import Union, List +from typing import List, Union + +import orjson +import questionary from scribe_data.utils import ( DEFAULT_DUMP_EXPORT_DIR, - language_metadata, - data_type_metadata, check_index_exists, + data_type_metadata, + language_metadata, ) -import questionary +from tqdm import tqdm class LexemeProcessor: @@ -51,38 +51,38 @@ def __init__( - 'total' data_types is a list of categories (e.g., ["nouns", "adverbs"]) for forms. """ - # Pre-compute sets for faster lookups + # Pre-compute sets for faster lookups. self.parse_type = set(parse_type or []) self.data_types = set(data_types or []) self.target_iso = set( [target_iso] if isinstance(target_iso, str) else target_iso or [] ) - # Pre-compute valid categories and languages + # Pre-compute valid categories and languages. self._category_lookup = {v: k for k, v in data_type_metadata.items()} self.valid_categories = set(data_type_metadata.values()) - # Build optimized language mapping + # Build optimized language mapping. self.iso_to_name = self._build_iso_mapping() self.valid_iso_codes = set(self.iso_to_name.keys()) - # Separate data structures + # Separate data structures. self.translations_index = defaultdict( lambda: defaultdict(lambda: defaultdict(dict)) ) self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) - # Stats + # Stats. self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} - # For category lookups, invert data_type_metadata - # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...} + # For category lookups, invert data_type_metadata. + # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...}. self._category_lookup = {v: k for k, v in data_type_metadata.items()} - # Build map from ISO to full language name + # Build map from ISO to full language name. self.iso_to_name = self._build_iso_mapping() - # For "total" usage + # For "total" usage. 
self.lexical_category_counts = defaultdict(Counter) self.translation_counts = defaultdict(Counter) self.forms_counts = defaultdict(Counter) @@ -97,9 +97,10 @@ def _build_iso_mapping(self) -> dict: for lang_name, data in language_metadata.items(): if self.target_iso and lang_name not in self.target_iso: continue - iso_code = data.get("iso") - if iso_code: + + if iso_code := data.get("iso"): iso_mapping[iso_code] = lang_name + return iso_mapping # MARK: process total @@ -116,16 +117,18 @@ def _process_lexeme_total(self, lexeme: dict) -> None: if not category_name: return - # Update counters + # Update counters. lemmas = lexeme.get("lemmas", {}) for lemma in lemmas.values(): lang = lemma.get("language") + if lang in self.iso_to_name: self.lexical_category_counts[lang][category_name] += 1 translation_count = sum( len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) ) self.translation_counts[lang][category_name] += translation_count + break # MARK: process translations @@ -144,7 +147,7 @@ def _process_lexeme_translations(self, lexeme: dict) -> None: if not category_name: return - # Only store first valid lemma for translations + # Only store first valid lemma for translations. for lang_code, lemma_data in lemmas.items(): if lang_code not in self.iso_to_name: continue @@ -153,7 +156,7 @@ def _process_lexeme_translations(self, lexeme: dict) -> None: if not word: continue - # Build translations from sense glosses + # Build translations from sense glosses. translations = {} for sense in lexeme.get("senses", []): for sense_lang_code, gloss in sense.get("glosses", {}).items(): @@ -162,7 +165,8 @@ def _process_lexeme_translations(self, lexeme: dict) -> None: if translations: self.translations_index[word][lang_code][category_name] = translations - break # Only handle the first lemma + + break # only handle the first lemma # MARK: process forms def _process_lexeme_forms(self, lexeme: dict) -> None: @@ -173,7 +177,7 @@ def _process_lexeme_forms(self, lexeme: dict) -> None: lemmas = lexeme.get("lemmas", {}) lexical_category = lexeme.get("lexicalCategory") - # Skip if category missing or not recognized + # Skip if category missing or not recognized. if not lexical_category or lexical_category not in data_type_metadata.values(): return @@ -183,11 +187,11 @@ def _process_lexeme_forms(self, lexeme: dict) -> None: return # If the category_name is NOT in our data_types list, skip - # e.g., category_name = "nouns", but user didn't request "nouns" in data_types + # e.g., category_name = "nouns", but user didn't request "nouns" in data_types. if category_name not in self.data_types: return - # Process forms + # Process forms. for lang_code, lemma_data in lemmas.items(): if lang_code not in self.iso_to_name: continue @@ -203,29 +207,29 @@ def _process_lexeme_forms(self, lexeme: dict) -> None: for rep_lang, rep_data in representations.items(): if rep_lang == lang_code: - form_value = rep_data.get("value") - if form_value: + if form_value := rep_data.get("value"): forms_data[form_value].extend(grammatical_features) if forms_data: self.forms_index[word][lang_code][category_name] = dict(forms_data) self.forms_counts[lang_code][category_name] += len(forms_data) + break # only first valid lemma # MARK: process lines def process_lines(self, line: str) -> None: """ Process one line of data. 
Depending on parse_type, we do: - - total stats - - translations - - form categories (filtered by data_types) + - total stats + - translations + - form categories (filtered by data_types) """ try: lexeme = orjson.loads(line.strip().rstrip(",")) if not lexeme: return - # Get common values once + # Get common values once. lemmas = lexeme.get("lemmas", {}) lexical_category = lexeme.get("lexicalCategory") @@ -236,7 +240,7 @@ def process_lines(self, line: str) -> None: if not category_name: return - # Process each type in a single pass through the data + # Process each type in a single pass through the data. for lang_code, lemma_data in lemmas.items(): if lang_code not in self.valid_iso_codes: continue @@ -256,13 +260,12 @@ def process_lines(self, line: str) -> None: ) if "translations" in self.parse_type: - translations = { + if translations := { lang: gloss["value"] for sense in lexeme.get("senses", []) for lang, gloss in sense.get("glosses", {}).items() if lang in self.valid_iso_codes - } - if translations: + }: self.translations_index[word][lang_code][category_name] = ( translations ) @@ -274,8 +277,7 @@ def process_lines(self, line: str) -> None: "representations", {} ).items(): if rep_lang == lang_code: - form_value = rep_data.get("value") - if form_value: + if form_value := rep_data.get("value"): forms_data[form_value].extend( form.get("grammaticalFeatures", []) ) @@ -286,7 +288,7 @@ def process_lines(self, line: str) -> None: ) self.forms_counts[lang_code][category_name] += len(forms_data) - break # Only process first valid lemma + break # only process first valid lemma except Exception as e: print(f"Error processing line: {e}") @@ -296,14 +298,14 @@ def process_file(self, file_path: str, batch_size: int = 50000): """ Main loop: read lines from file (bz2) in batches, call process_lines on each. """ - # Use context manager for better resource handling + # Use context manager for better resource handling. with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: - # Skip header if present + # Skip header if present. first_line = bzfile.readline() if not first_line.strip().startswith("["): bzfile.seek(0) - # Process in larger batches for better performance + # Process in larger batches for better performance. batch = [] start_time = time.time() total_entries = int(Path(file_path).stat().st_size / 263) @@ -311,28 +313,29 @@ def process_file(self, file_path: str, batch_size: int = 50000): for line in tqdm(bzfile, total=total_entries, desc="Processing entries"): if line.strip() not in ["[", "]", ",", ""]: batch.append(line) + if len(batch) >= batch_size: self._process_batch(batch) - batch.clear() # More efficient than creating new list + batch.clear() # more efficient than creating new list self.stats["processed_entries"] += 1 - # Process remaining items + # Process remaining items. if batch: self._process_batch(batch) - # Update stats + # Update stats. self.stats["processing_time"] = time.time() - start_time self.stats["unique_words"] = len(self.forms_index) + len( self.translations_index ) - # Print summary if "total" was requested + # Print summary if "total" was requested. if "total" in self.parse_type: self._print_total_summary() def _process_batch(self, batch: list) -> None: """ - Process a batch of lines + Process a batch of lines. """ for line in batch: self.process_lines(line) @@ -340,7 +343,7 @@ def _process_batch(self, batch: list) -> None: # MARK: print total summary def _print_total_summary(self): """ - Print stats if parse_type == total + Print stats if parse_type == total. 
""" print( f"{'Language':<20} {'Data Type':<25} {'Total Lexemes':<25} {'Total Translations':<20}" @@ -349,15 +352,19 @@ def _print_total_summary(self): for lang, counts in self.lexical_category_counts.items(): lang_name = self.iso_to_name[lang] first_row = True + for category, count in counts.most_common(): trans_count = self.translation_counts[lang][category] + if first_row: print( f"{lang_name:<20} {category:<25} {count:<25,} {trans_count:<20,}" ) first_row = False + else: print(f"{'':<20} {category:<25} {count:<25,} {trans_count:<20,}") + if lang != list(self.lexical_category_counts.keys())[-1]: print("\n" + "=" * 90 + "\n") @@ -372,12 +379,12 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N f"Warning: ISO {language_iso} unknown, skipping translations export..." ) return - # Filter - filtered = {} - for word, lang_data in self.translations_index.items(): - if language_iso in lang_data: - filtered[word] = {language_iso: lang_data[language_iso]} + filtered = { + word: {language_iso: lang_data[language_iso]} + for word, lang_data in self.translations_index.items() + if language_iso in lang_data + } self._save_by_language(filtered, filepath, language_iso, "translations") # MARK: export forms @@ -386,8 +393,8 @@ def export_forms_json( ) -> None: """ Save forms_index to file, optionally filtering by: - - language_iso - - data_type (e.g. "nouns", "adverbs") + - language_iso + - data_type (e.g. "nouns", "adverbs") If data_type is given, we only export that one category from forms. """ @@ -395,10 +402,11 @@ def export_forms_json( if language_iso not in self.iso_to_name: print(f"Warning: ISO {language_iso} unknown, skipping forms export...") return + filtered = {} for word, lang_data in self.forms_index.items(): if language_iso in lang_data: - # If data_type is given, only keep that category + # If data_type is given, only keep that category. if data_type: if data_type in lang_data[language_iso]: filtered[word] = { @@ -406,15 +414,17 @@ def export_forms_json( data_type: lang_data[language_iso][data_type] } } + else: filtered[word] = {language_iso: lang_data[language_iso]} + self._save_by_language( filtered, filepath, language_iso, data_type or "forms" ) def _save_by_language(self, data, filepath, language_iso, category_type): """ - Save data to exports//filename + Save data to exports//filename. """ base_path = Path(filepath) lang_name = self.iso_to_name[language_iso] @@ -437,6 +447,7 @@ def _to_dict(self, dd): """ if isinstance(dd, defaultdict): dd = {k: self._to_dict(v) for k, v in dd.items()} + return dd @@ -456,18 +467,23 @@ def parse_dump( ---------- language : str or list of str, optional Language(s) to parse data for. Must match language names in language_metadata. + parse_type : list of str, optional Types of parsing to perform. Valid options are: - 'translations': Extract word translations - 'form': Extract grammatical forms - 'total': Gather statistical totals + data_types : list of str, optional Categories to parse when using 'form' type (e.g. ["nouns", "adverbs"]). Only used if 'form' is in parse_type. + file_path : str, default="latest-lexemes.json.bz2" Path to the lexeme dump file + output_dir : str, optional Directory to save output files. If None, uses DEFAULT_DUMP_EXPORT_DIR. + overwrite_all : bool, default=False If True, automatically overwrite existing files without prompting @@ -480,11 +496,11 @@ def parse_dump( If a requested index file already exists, that language/category combination will be skipped. 
""" - # 1) Prepare environment - Use default if output_dir is None + # Prepare environment - Use default if output_dir is None. output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR Path(output_dir).mkdir(parents=True, exist_ok=True) - # Convert single strings to lists + # Convert single strings to lists. languages = [language] if isinstance(language, str) else language parse_type = parse_type or [] data_types = data_types or [] @@ -503,20 +519,22 @@ def parse_dump( if choice == "Overwrite existing data": overwrite_all = True - # For translations, we only need to check the translations index + # For translations, we only need to check the translations index. if "translations" in parse_type: languages_to_process = [] for lang in languages: index_path = Path(output_dir) / lang / "lexeme_translations.json" + if not check_index_exists(index_path, overwrite_all): languages_to_process.append(lang) + else: print(f"Skipping {lang}/translations.json - already exists") - # Update languages list but keep data_types as is + # Update languages list but keep data_types as is. languages = languages_to_process - # For forms, check each language/data_type combination + # For forms, check each language/data_type combination. elif "form" in parse_type: languages_to_process = [] data_types_to_process = set() @@ -525,16 +543,18 @@ def parse_dump( needs_processing = False for data_type in data_types: index_path = Path(output_dir) / lang / f"lexeme_{data_type}.json" + if not check_index_exists(index_path, overwrite_all): needs_processing = True data_types_to_process.add(data_type) + else: print(f"Skipping {lang}/{data_type}.json - already exists") if needs_processing: languages_to_process.append(lang) - # Update both lists + # Update both lists. languages = languages_to_process data_types = list(data_types_to_process) @@ -553,11 +573,11 @@ def parse_dump( # MARK: Handle JSON exports - # (a) If "translations" in parse_type -> export them + # (a) If "translations" in parse_type -> export them. if "translations" in parse_type: index_path = Path(output_dir) / "lexeme_translations.json" - # Export translations for each ISO found + # Export translations for each ISO found. iso_codes = set() for word_data in processor.translations_index.values(): iso_codes.update(word_data.keys()) @@ -565,9 +585,9 @@ def parse_dump( if iso_code in processor.iso_to_name: processor.export_translations_json(str(index_path), iso_code) - # (b) If "form" in parse_type -> export forms for each data_type in data_types + # (b) If "form" in parse_type -> export forms for each data_type in data_types. if "form" in parse_type: - # For each data_type, we create a separate file, e.g. lexeme_nouns.json + # For each data_type, we create a separate file, e.g. lexeme_nouns.json. for dt in data_types: index_path = Path(output_dir) / f"lexeme_{dt}.json" print(f"Exporting forms for {dt} to {index_path}...") diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py index 5faa486a..6968c8ad 100644 --- a/src/scribe_data/wiktionary/parse_mediaWiki.py +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -1,5 +1,5 @@ """ - Functions to parse the translations of a word from MediaWiki API. +Functions to parse the translations of a word from MediaWiki API. .. 
raw:: html """ -import re import json -from scribe_data.wikidata.wikidata_utils import mediaWiki_query +import re + from scribe_data.utils import get_language_from_iso +from scribe_data.wikidata.wikidata_utils import mediaWiki_query def fetch_translation_page(word): data = mediaWiki_query(word) pages = data.get("query", {}).get("pages", {}) - # Extract page object from dictionary + # Extract page object from dictionary. page = next(iter(pages.values())) if pages else {} - # Get the wikitext from the 'revisions' key - wikitext = page.get("revisions", [{}])[0].get("*", "") - return wikitext + + # Get the wikitext from the 'revisions' key. + return page.get("revisions", [{}])[0].get("*", "") def parse_wikitext_for_translations(wikitext): @@ -43,28 +44,29 @@ def parse_wikitext_for_translations(wikitext): language codes, part of speech, and context. """ translations_by_lang = {} - current_part_of_speech = None # Track whether we are in Noun or Verb - current_context = None # Track the current trans-top context + current_part_of_speech = None # track whether we are in Noun or Verb + current_context = None # track the current trans-top context # Split the wikitext into individual lines for line in wikitext.splitlines(): - # Detect part of speech/data-types: Noun or Verb + # Detect part of speech/data-types: Noun or Verb. if line.startswith("===Noun==="): current_part_of_speech = "Noun" + elif line.startswith("===Verb==="): current_part_of_speech = "Verb" - trans_top_match = re.match(r"\{\{trans-top\|(.+?)\}\}", line) - if trans_top_match: - current_context = trans_top_match.group(1).strip() - - template_match = re.match( - r"^\*\s([A-Za-z\s]+):\s\{\{t\+?\|([a-zA-Z\-]+)\|([^|]+)\}\}", line.strip() - ) - if template_match: - lang_code = template_match.group(2).strip() - translation_text = template_match.group(3).strip() - - # Ensure there's a list to hold translations for this language + + if trans_top_match := re.match(r"\{\{trans-top\|(.+?)\}\}", line): + current_context = trans_top_match[1].strip() + + if template_match := re.match( + r"^\*\s([A-Za-z\s]+):\s\{\{t\+?\|([a-zA-Z\-]+)\|([^|]+)\}\}", + line.strip(), + ): + lang_code = template_match[2].strip() + translation_text = template_match[3].strip() + + # Ensure there's a list to hold translations for this language. if lang_code not in translations_by_lang: translations_by_lang[lang_code] = [] @@ -84,17 +86,17 @@ def build_json_format(word, translations_by_lang): Build the final JSON format for the translations of a word. """ book_translations = {word: {}} - # Keep counters to number the translations for each (lang, part_of_speech) + # Keep counters to number the translations for each (lang, part_of_speech). language_counters = {} for lang_code, entries in translations_by_lang.items(): try: lang_name = get_language_from_iso(lang_code) except ValueError: - # Skip this language if it's not supported + # Skip this language if it's not supported. continue - # Make sure this language is in the dictionary + # Make sure this language is in the dictionary. if lang_name not in book_translations[word]: book_translations[word][lang_name] = {} @@ -109,7 +111,7 @@ def build_json_format(word, translations_by_lang): idx = str(language_counters[(lang_code, pos)]) - # Insert the item at the next available index + # Insert the item at the next available index. 
book_translations[word][lang_name][pos][idx] = { "description": desc, "translations": trans, diff --git a/tests/cli/test_download.py b/tests/cli/test_download.py index 29b24751..d4987b22 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/test_download.py @@ -127,7 +127,7 @@ def test_wd_lexeme_dump_download_wrapper_latest( mock_get.return_value.headers = {"content-length": "100"} mock_get.return_value.iter_content = lambda chunk_size: [b"data"] * 10 - # Mock DEFAULT_DUMP_EXPORT_DIR + # Mock DEFAULT_DUMP_EXPORT_DIR. with patch( "scribe_data.cli.download.DEFAULT_DUMP_EXPORT_DIR", new="test_export_dir" ): @@ -137,7 +137,7 @@ def test_wd_lexeme_dump_download_wrapper_latest( mock_makedirs.assert_called_with("test_export_dir", exist_ok=True) mock_confirm.assert_called_once() - @patch("scribe_data.utils.select") + @patch("scribe_data.utils.questionary.select") @patch( "scribe_data.utils.Path.glob", return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], @@ -146,7 +146,7 @@ def test_check_lexeme_dump_prompt_download_existing(self, mock_glob, mock_select """ Test prompt for using existing lexeme dump files. """ - # Mock the select dialog to return "Use existing latest dump" + # Mock the select dialog to return "Use existing latest dump". mock_select.return_value.ask.return_value = "Use existing latest dump" result = check_lexeme_dump_prompt_download( @@ -154,7 +154,7 @@ def test_check_lexeme_dump_prompt_download_existing(self, mock_glob, mock_select ) self.assertEqual(result.name, "latest-lexemes.json.bz2") - @patch("scribe_data.utils.select") + @patch("scribe_data.utils.questionary.select") @patch( "scribe_data.utils.Path.glob", return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], @@ -163,7 +163,7 @@ def test_check_lexeme_dump_prompt_download_delete(self, mock_glob, mock_select): """ Test prompt for deleting existing lexeme dump files. """ - # Configure the mock to return "Delete existing dumps" first and then "No" + # Configure the mock to return "Delete existing dumps" first and then "No". mock_select.side_effect = [ MagicMock(ask=MagicMock(return_value="Delete existing dumps")), MagicMock(ask=MagicMock(return_value="No")), diff --git a/tests/wikidata/test_check_query.py b/tests/wikidata/test_check_query.py index e50b3955..19b3097b 100755 --- a/tests/wikidata/test_check_query.py +++ b/tests/wikidata/test_check_query.py @@ -25,6 +25,7 @@ from pathlib import Path from unittest.mock import MagicMock, mock_open, patch from urllib.error import HTTPError + import pytest from scribe_data.wikidata.check_query.check import ( all_queries, From f28a176f07ab829ba3232faa7daafada39861d56 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 4 Jan 2025 14:19:19 +0100 Subject: [PATCH 13/13] Minor fix to the contribution guide --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f929c112..d7f767fe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -300,7 +300,7 @@ def example_function(argument: argument_type) -> return_type: Parameters ---------- - argument: argument_type + argument : argument_type Description of your argument. Returns
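The test changes above patch scribe_data.utils.questionary.select and stub out its .ask() return values. A minimal standalone sketch of that mocking pattern, assuming Scribe-Data is importable; the test class name and prompt strings here are illustrative:

import unittest
from unittest.mock import MagicMock, patch


class TestQuestionarySelectMocking(unittest.TestCase):
    """Illustrative sketch of mocking questionary.select().ask() in tests."""

    @patch("scribe_data.utils.questionary.select")
    def test_single_prompt(self, mock_select):
        # One prompt: .ask() on the returned dialog yields the chosen option.
        mock_select.return_value.ask.return_value = "Use existing latest dump"
        self.assertEqual(mock_select("Do you want to:").ask(), "Use existing latest dump")

    @patch("scribe_data.utils.questionary.select")
    def test_sequential_prompts(self, mock_select):
        # Two prompts in sequence: side_effect supplies one mock dialog per call.
        mock_select.side_effect = [
            MagicMock(ask=MagicMock(return_value="Delete existing dumps")),
            MagicMock(ask=MagicMock(return_value="No")),
        ]
        self.assertEqual(mock_select("Do you want to:").ask(), "Delete existing dumps")
        self.assertEqual(mock_select("Download new version?").ask(), "No")


if __name__ == "__main__":
    unittest.main()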