From 5a7f273605d6f7ccd77501365a7f9071bc8be7c0 Mon Sep 17 00:00:00 2001 From: axif Date: Sun, 22 Dec 2024 17:33:32 +0600 Subject: [PATCH 01/13] issue 523 done and in check_lexeme_dump_prompt_download added select for better view --- src/scribe_data/cli/main.py | 33 ++++++++++++++++ src/scribe_data/utils.py | 25 ++++++++---- tests/cli/test_download.py | 76 +++++++++++++++++++++++++------------ 3 files changed, 101 insertions(+), 33 deletions(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index d4c49f6e..ea8af9c5 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -25,6 +25,7 @@ from pathlib import Path from rich import print as rprint +from questionary import select from scribe_data.cli.cli_utils import validate_language_and_data_type from scribe_data.cli.convert import convert_wrapper @@ -303,6 +304,16 @@ def main() -> None: help="The output directory path for the downloaded dump.", ) + # MARK: Interactive + + interactive_parser = subparsers.add_parser( + "interactive", + aliases=["i"], + help="Run in interactive mode.", + description="Run in interactive mode.", + ) + interactive_parser._actions[0].help = "Show this help message and exit." + # MARK: Setup CLI args = parser.parse_args() @@ -402,6 +413,28 @@ def main() -> None: output_dir=args.output_dir, ) + elif args.command in ["interactive", "i"]: + action = select( + "What would you like to do?", + choices=[ + "Download a Wikidata dump", + "Check for totals", + "Get data", + "Get translations", + "Exit", + ], + ).ask() + + if action == "Download a Wikidata dump": + wd_lexeme_dump_download_wrapper() + elif action == "Check for totals": + start_interactive_mode(operation="total") + elif action == "Get data": + start_interactive_mode(operation="get") + elif action == "Get translations": + print("Coming soon!") + else: + print("Skipping action") else: parser.print_help() diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 933c7623..6f615444 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -31,6 +31,7 @@ from typing import Any, Optional from rich import print as rprint +from questionary import select # MARK: Utils Variables @@ -649,19 +650,27 @@ def check_lexeme_dump_prompt_download(output_dir: str): for dump in existing_dumps: rprint(f" - {Path(output_dir)}/{dump.name}") - user_input = input( - "\nDo you want to:\n - Delete existing dumps (d)?\n - Skip download (s)?\n - Use existing latest dump (u)?\n - Download new version(n)?\n[d/s/u/n]: " - ).lower() - - if user_input == "d": + user_input = select( + "Do you want to:", + choices=[ + "Delete existing dumps", + "Skip download", + "Use existing latest dump", + "Download new version", + ], + ).ask() + + if user_input.startswith("Delete"): for dump in existing_dumps: dump.unlink() rprint("[bold green]Existing dumps deleted.[/bold green]") - user_input = input("Do you want to download latest lexeme dump? (y/N): ") - return user_input != "y" + download_input = select( + "Do you want to download the latest lexeme dump?", choices=["Yes", "No"] + ).ask() + return download_input != "Yes" - elif user_input == "u": + elif user_input.startswith("Use"): # Check for the latest dump file. 
latest_dump = None if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps): diff --git a/tests/cli/test_download.py b/tests/cli/test_download.py index 5dfa5830..c77c524a 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/test_download.py @@ -101,15 +101,23 @@ def test_download_wd_lexeme_dump_by_date(self, mock_findall, mock_get): ) @patch("scribe_data.cli.download.requests.get") - @patch("scribe_data.cli.download.input", return_value="y") @patch( - "scribe_data.cli.download.check_lexeme_dump_prompt_download", return_value=None + "scribe_data.cli.download.check_lexeme_dump_prompt_download", return_value=False ) @patch("scribe_data.cli.download.open", new_callable=mock_open) @patch("scribe_data.cli.download.tqdm") - @patch("scribe_data.cli.download.DEFAULT_DUMP_EXPORT_DIR", new="test_export_dir") + @patch("scribe_data.cli.download.os.makedirs") + @patch( + "scribe_data.cli.download.input", return_value="y" + ) # Mocking input to return 'y' def test_wd_lexeme_dump_download_wrapper_latest( - self, mock_tqdm, mock_file, mock_check_prompt, mock_input, mock_get + self, + mock_input, + mock_makedirs, + mock_tqdm, + mock_file, + mock_check_prompt, + mock_get, ): """ Test wrapper function for downloading latest Wikidata lexeme dump. @@ -119,35 +127,53 @@ def test_wd_lexeme_dump_download_wrapper_latest( mock_get.return_value.headers = {"content-length": "100"} mock_get.return_value.iter_content = lambda chunk_size: [b"data"] * 10 - with patch("scribe_data.cli.download.os.makedirs") as mock_makedirs: + # Mock DEFAULT_DUMP_EXPORT_DIR + with patch( + "scribe_data.cli.download.DEFAULT_DUMP_EXPORT_DIR", new="test_export_dir" + ): download_path = wd_lexeme_dump_download_wrapper() + self.assertIsNotNone(download_path, "Download path should not be None") self.assertIn("latest-lexemes.json.bz2", download_path) mock_makedirs.assert_called_with("test_export_dir", exist_ok=True) + mock_input.assert_called_with( + "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities.\nDo you want to proceed? (y/n): " + ) - def test_check_lexeme_dump_prompt_download_existing(self): + @patch("scribe_data.utils.select") + @patch( + "scribe_data.utils.Path.glob", + return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], + ) + def test_check_lexeme_dump_prompt_download_existing(self, mock_glob, mock_select): """ Test prompt for using existing lexeme dump files. """ - with patch( - "scribe_data.utils.Path.glob", - return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], - ): - with patch("builtins.input", return_value="u"): - result = check_lexeme_dump_prompt_download( - "scribe_data/tests/cli/test_export_dir" - ) - self.assertEqual(result.name, "latest-lexemes.json.bz2") + # Mock the select dialog to return "Use existing latest dump" + mock_select.return_value.ask.return_value = "Use existing latest dump" + + result = check_lexeme_dump_prompt_download( + "scribe_data/tests/cli/test_export_dir" + ) + self.assertEqual(result.name, "latest-lexemes.json.bz2") - def test_check_lexeme_dump_prompt_download_delete(self): + @patch("scribe_data.utils.select") + @patch( + "scribe_data.utils.Path.glob", + return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], + ) + def test_check_lexeme_dump_prompt_download_delete(self, mock_glob, mock_select): """ Test prompt for deleting existing lexeme dump files. 
""" - mock_existing_files = [Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")] - with patch("scribe_data.utils.Path.glob", return_value=mock_existing_files): - with patch("builtins.input", side_effect=["d", "n"]): - with patch("scribe_data.utils.Path.unlink") as mock_unlink: - result = check_lexeme_dump_prompt_download( - "scribe_data/tests/cli/test_export_dir" - ) - self.assertTrue(mock_unlink.called) - self.assertTrue(result) + # Configure the mock to return "Delete existing dumps" first and then "No" + mock_select.side_effect = [ + MagicMock(ask=MagicMock(return_value="Delete existing dumps")), + MagicMock(ask=MagicMock(return_value="No")), + ] + + with patch("scribe_data.utils.Path.unlink") as mock_unlink: + result = check_lexeme_dump_prompt_download( + "scribe_data/tests/cli/test_export_dir" + ) + self.assertTrue(mock_unlink.called) + self.assertTrue(result) From d51549c95ac51fc981adb22371ff8d700a5a31b4 Mon Sep 17 00:00:00 2001 From: axif Date: Wed, 25 Dec 2024 19:20:06 +0600 Subject: [PATCH 02/13] added scribe-data get -l bengali -dt translations -wdp mama -od monu --- .gitignore | 1 + src/scribe_data/cli/download.py | 15 +- src/scribe_data/cli/get.py | 20 +- src/scribe_data/cli/main.py | 6 +- .../resources/data_type_metadata.json | 1 + src/scribe_data/utils.py | 1 - src/scribe_data/wikidata/wikidata_utils.py | 36 ++- src/scribe_data/wiktionary/parse_dump.py | 236 ++++++++++++++++++ tests/cli/test_download.py | 12 +- tests/cli/test_get.py | 12 +- 10 files changed, 302 insertions(+), 38 deletions(-) create mode 100644 src/scribe_data/wiktionary/parse_dump.py diff --git a/.gitignore b/.gitignore index 610b9da8..0f860902 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ scribe_data_tsv_export/* # MARK: Wiki Dumps *.json.bz2 +*.log diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py index 2f741545..4ce478e0 100644 --- a/src/scribe_data/cli/download.py +++ b/src/scribe_data/cli/download.py @@ -30,6 +30,7 @@ import requests from rich import print as rprint from tqdm import tqdm +import questionary from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download @@ -244,16 +245,12 @@ def wd_lexeme_dump_download_wrapper( filename = dump_url.split("/")[-1] output_path = str(Path(output_dir) / filename) - user_response = ( - input( - "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities." - "\nDo you want to proceed? (y/n): " - ) - .strip() - .lower() - ) + user_response = questionary.confirm( + "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. Do you want to proceed?", + default=True, + ).ask() - if user_response == "y": + if user_response: rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]") response = requests.get(dump_url, stream=True) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index a32b77fe..1aedb8dd 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -26,6 +26,7 @@ from typing import List, Union from rich import print as rprint +import questionary from scribe_data.cli.convert import convert_wrapper from scribe_data.unicode.generate_emoji_keywords import generate_emoji @@ -111,10 +112,10 @@ def prompt_user_download_all(): """ Checks with the user if they'd rather use Wikidata lexeme dumps before a download all call. """ - download_all_input = input( - "Do you want to query Wikidata, or would you rather use Wikidata lexeme dumps? 
(y/N): " - ) - return download_all_input == "y" + return questionary.confirm( + "Do you want to query Wikidata directly? (selecting 'no' will use Wikidata lexeme dumps)", + default=False, + ).ask() if all: if language: @@ -164,6 +165,17 @@ def prompt_user_download_all(): elif data_type in {"emoji-keywords", "emoji_keywords"}: generate_emoji(language=language, output_dir=output_dir) + # MARK: Translations + + elif data_type == "translations": + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type="translations", + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + ) + return + # MARK: Query Data elif language or data_type: diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index ea8af9c5..e2a7bb94 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -298,8 +298,8 @@ def main() -> None: help="Download Wikidata dump. Optionally specify date in YYYYMMDD format.", ) download_parser.add_argument( - "-od", - "--output-dir", + "-wdp", + "--wikidata-dump-path", type=str, help="The output directory path for the downloaded dump.", ) @@ -410,7 +410,7 @@ def main() -> None: wikidata_dump=args.wikidata_dump_version if args.wikidata_dump_version != "latest" else None, - output_dir=args.output_dir, + output_dir=args.wikidata_dump_path, ) elif args.command in ["interactive", "i"]: diff --git a/src/scribe_data/resources/data_type_metadata.json b/src/scribe_data/resources/data_type_metadata.json index ff6249f1..4800b0e9 100644 --- a/src/scribe_data/resources/data_type_metadata.json +++ b/src/scribe_data/resources/data_type_metadata.json @@ -11,5 +11,6 @@ "prepositions": "Q4833830", "pronouns": "Q36224", "proper_nouns": "Q147276", + "translations": "Q21112633", "verbs": "Q24905" } diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 6f615444..0381816e 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -694,7 +694,6 @@ def check_lexeme_dump_prompt_download(output_dir: str): latest_dump = max(dated_dumps, key=lambda x: x[1])[0] if latest_dump: - rprint(f"[bold green]Using latest dump:[/bold green] {latest_dump}") return latest_dump else: diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index d0fbcc6b..a58b6263 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -20,39 +20,55 @@ --> """ +from pathlib import Path from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper from scribe_data.cli.download import wd_lexeme_dump_download_wrapper +from scribe_data.wiktionary.parse_dump import parse_dump sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) sparql.setMethod(POST) -def parse_wd_lexeme_dump(wikidata_dump: str = None): +def parse_wd_lexeme_dump( + language: str = None, + wikidata_dump_type: str = None, + type_output_dir: str = None, + wikidata_dump_path: str = None, +): """ Checks for the existence of a Wikidata dump and parses it if possible. Parameters ---------- - wikidata_dump : str + wikidata_dump_path : str The local Wikidata dump that should be used to get data. - + output_dir : str + The directory to save the parsed data. + language : str + The language to parse the data for. Returns ------- The requested data saved locally given file type and location arguments. 
""" - if wikidata_dump: - wd_lexeme_dump_download_wrapper(None, wikidata_dump) + file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) - else: - file_path = wd_lexeme_dump_download_wrapper() - if isinstance(file_path, str) and file_path: + if isinstance(file_path, (str, Path)): + path = Path(file_path) + if path.exists(): rprint( "[bold green]We'll use the following lexeme dump[/bold green]", file_path, ) - rprint( - "[bold red]Parsing Wikidata lexeme dump feature will be available soon...[/bold red]" + parse_dump( + language=language, + parse_type=wikidata_dump_type, + type_output_dir=type_output_dir, + file_path=file_path, ) + + return + + rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]") diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py new file mode 100644 index 00000000..99c22d7a --- /dev/null +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -0,0 +1,236 @@ +import bz2 +import orjson +from collections import defaultdict +import time +import json +from typing import Dict, Any +from pathlib import Path +import logging +from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR +from scribe_data.utils import language_metadata +from tqdm import tqdm + +# MARK: Logging +logging.basicConfig( + filename="lexeme_processor.log", + filemode="a", + format="%(asctime)s - %(levelname)s - %(message)s", + level=logging.ERROR, +) + + +class LexemeProcessor: + def __init__(self, target_iso: str = None): + self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} + self.target_iso = target_iso + # Used to filter the language_metadata to only include the target language and its sublanguages + # If target_iso is not set, then all languages are included + self.iso_to_name = {} + for lang_name, data in language_metadata.items(): + if lang_name == self.target_iso: + self.iso_to_name[data["iso"]] = lang_name + break + if not self.target_iso: + if "iso" in data: + self.iso_to_name[data["iso"]] = lang_name + elif "sub_languages" in data: + for sublang_data in data["sub_languages"].values(): + if "iso" in sublang_data: + self.iso_to_name[sublang_data["iso"]] = lang_name + + def process_lines_for_translations(self, line: str) -> Dict[str, Any]: + """ + Process a single line of lexeme data. 
+ """ + try: + lexeme = orjson.loads(line.strip().rstrip(",")) + + lemmas = lexeme.get("lemmas", {}) + datatype = lexeme.get("lexicalCategory") + senses = lexeme.get("senses", []) + + # Skip invalid entries + if not lemmas or not datatype: + return {} + + # Get the first lemma + first_lang, first_lemma = next(iter(lemmas.items())) + word = first_lemma.get("value", "").lower() # Normalize to lowercase + word_lang = first_lemma.get("language", "") + + # Skip if word is empty or language ISO is not in our metadata + if not word or word_lang not in self.iso_to_name: + return {} + + # Process all senses and their translations + translations = {} + for sense in senses: + glosses = sense.get("glosses", {}) + translations.update( + { + lang_code: gloss["value"] + for lang_code, gloss in glosses.items() + if lang_code + in self.iso_to_name # Only keep translations for known languages + } + ) + + if not translations: + return {} + + self.word_index[word][word_lang][datatype] = translations + + return {word: {word_lang: {datatype: translations}}} + except Exception as e: + logging.error(f"Error processing line: {e}") + return {} + + def process_file(self, file_path: str, batch_size: int = 1000) -> None: + start_time = time.time() + + try: + # Get file size and estimate number of entries (average 263 bytes per entry based on real data) + total_entries = int(Path(file_path).stat().st_size / 263) + + with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: + first_line = bzfile.readline() + if not first_line.strip().startswith("["): + bzfile.seek(0) + + batch = [] + # Use dynamic total based on file size + for line in tqdm( + bzfile, desc="Processing entries", total=total_entries + ): + stripped_line = line.strip() + if stripped_line in [ + "]", + "[", + ",", + "", + ]: # Skip structural JSON elements + continue + + batch.append(line) + + if len(batch) >= batch_size: + self._process_batch(batch) + batch = [] + + self.stats["processed_entries"] += 1 + + # Process remaining items + if batch: + self._process_batch(batch) + + self.stats["processing_time"] = time.time() - start_time + self.stats["unique_words"] = len(self.word_index) + print( + f"Processed {self.stats['processed_entries']:,} entries in {self.stats['processing_time']:.2f} seconds" + ) + + except FileNotFoundError: + logging.error(f"File not found: {file_path}") + print(f"Error: File not found - {file_path}") + except Exception as e: + logging.error(f"Error processing file: {e}") + print(f"Error processing file: {e}") + + def _process_batch(self, batch: list) -> None: + for line in batch: + # self.process_lines_for_forms(line) + self.process_lines_for_translations(line) + + def save_index(self, filepath: str, language_iso: str = None) -> None: + """ + Save index to file, optionally filtering by language ISO code. 
+ """ + if language_iso: + # Only proceed if we have a valid ISO code + if language_iso not in self.iso_to_name: + print(f"Warning: Unknown ISO code {language_iso}, skipping...") + return + + # Get full language name + full_language_name = self.iso_to_name[language_iso] + + # Filter word_index for specific language + filtered_index = {} + for word, lang_data in self.word_index.items(): + if language_iso in lang_data: + filtered_index[word] = {language_iso: lang_data[language_iso]} + + # Create language-specific filepath using full name + base_path = Path(filepath) + lang_filepath = base_path.parent / full_language_name / base_path.name + lang_filepath.parent.mkdir(parents=True, exist_ok=True) + + print(f"Saving {full_language_name} index to {lang_filepath}...") + with open(lang_filepath, "w", encoding="utf-8") as f: + json.dump(filtered_index, f, indent=2, ensure_ascii=False) + else: + print(f"Saving complete index to {filepath}...") + with open(filepath, "w", encoding="utf-8") as f: + json.dump( + self._convert_defaultdict_to_dict(self.word_index), + f, + indent=2, + ensure_ascii=False, + ) + + def _convert_defaultdict_to_dict(self, dd): + if isinstance(dd, defaultdict): + dd = {k: self._convert_defaultdict_to_dict(v) for k, v in dd.items()} + return dd + + def load_index(self, filepath: str) -> None: + print(f"Loading index from {filepath}...") + try: + with open(filepath, "r", encoding="utf-8") as f: + loaded_data = json.load(f) + self.word_index = defaultdict( + lambda: defaultdict(lambda: defaultdict(dict)) + ) + self._recursive_update(self.word_index, loaded_data) + except FileNotFoundError: + logging.error(f"Index file not found: {filepath}") + print(f"Error: Index file not found - {filepath}") + except Exception as e: + logging.error(f"Error loading index: {e}") + print(f"Error loading index: {e}") + + def _recursive_update(self, dd, data): + for key, value in data.items(): + if isinstance(value, dict): + dd[key] = defaultdict(lambda: defaultdict(dict)) + self._recursive_update(dd[key], value) + else: + dd[key] = value + + def get_word_info(self, word: str) -> Dict[str, Any]: + return self.word_index.get(word.lower(), {}) + + +def parse_dump( + language: str = None, + parse_type: str = None, + type_output_dir: str = DEFAULT_DUMP_EXPORT_DIR, + file_path: str = "latest-lexemes.json.bz2", +): + index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + + processor = LexemeProcessor(target_iso=language) + + print("Processing the lexeme data file...") + processor.process_file(file_path) + + # Get unique ISO codes from the processed data + iso_codes = set() + for word_data in processor.word_index.values(): + iso_codes.update(word_data.keys()) + + # Save individual files for each valid language + for iso_code in iso_codes: + if iso_code in processor.iso_to_name: # Only process known ISO codes + processor.save_index(str(index_path), iso_code) diff --git a/tests/cli/test_download.py b/tests/cli/test_download.py index c77c524a..29b24751 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/test_download.py @@ -107,12 +107,10 @@ def test_download_wd_lexeme_dump_by_date(self, mock_findall, mock_get): @patch("scribe_data.cli.download.open", new_callable=mock_open) @patch("scribe_data.cli.download.tqdm") @patch("scribe_data.cli.download.os.makedirs") - @patch( - "scribe_data.cli.download.input", return_value="y" - ) # Mocking input to return 'y' + @patch("scribe_data.cli.download.questionary.confirm") def test_wd_lexeme_dump_download_wrapper_latest( self, - mock_input, + 
mock_confirm, mock_makedirs, mock_tqdm, mock_file, @@ -122,6 +120,8 @@ def test_wd_lexeme_dump_download_wrapper_latest( """ Test wrapper function for downloading latest Wikidata lexeme dump. """ + mock_confirm.return_value.ask.return_value = True + mock_get.return_value.text = 'href="latest-all.json.bz2"' mock_get.return_value.raise_for_status = MagicMock() mock_get.return_value.headers = {"content-length": "100"} @@ -135,9 +135,7 @@ def test_wd_lexeme_dump_download_wrapper_latest( self.assertIsNotNone(download_path, "Download path should not be None") self.assertIn("latest-lexemes.json.bz2", download_path) mock_makedirs.assert_called_with("test_export_dir", exist_ok=True) - mock_input.assert_called_with( - "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities.\nDo you want to proceed? (y/n): " - ) + mock_confirm.assert_called_once() @patch("scribe_data.utils.select") @patch( diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 2a5e9c4d..f4e46bb0 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -48,8 +48,10 @@ def test_invalid_arguments(self): # MARK: All Data @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_data_types_for_language(self, mock_query_data): + @patch("scribe_data.cli.get.questionary.confirm") + def test_get_all_data_types_for_language(self, mock_confirm, mock_query_data): + mock_confirm.return_value.ask.return_value = False + get_data(all=True, language="English") mock_query_data.assert_called_once_with( languages=["English"], @@ -59,8 +61,10 @@ def test_get_all_data_types_for_language(self, mock_query_data): ) @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_languages_for_data_type(self, mock_query_data): + @patch("scribe_data.cli.get.questionary.confirm") + def test_get_all_languages_for_data_type(self, mock_confirm, mock_query_data): + mock_confirm.return_value.ask.return_value = False + get_data(all=True, data_type="nouns") mock_query_data.assert_called_once_with( languages=None, From bb9e3338fcc1af183a59643522f6cfc7e8b06937 Mon Sep 17 00:00:00 2001 From: axif Date: Thu, 26 Dec 2024 22:59:09 +0600 Subject: [PATCH 03/13] forms total add ( missing total translations need feedback) --- src/scribe_data/cli/main.py | 3 +- src/scribe_data/cli/total.py | 30 +++- src/scribe_data/wiktionary/parse_dump.py | 186 +++++++++++++++++------ 3 files changed, 165 insertions(+), 54 deletions(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index e2a7bb94..57ac7973 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -200,7 +200,8 @@ def main() -> None: total_parser.add_argument( "-wdp", "--wikidata-dump-path", - type=str, + nargs="?", + const=True, help="Path to a local Wikidata lexemes dump for running with '--all'.", ) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 71881dda..37016c97 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -36,6 +36,7 @@ list_all_languages, ) from scribe_data.wikidata.wikidata_utils import sparql +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump def get_qid_by_input(input_str): @@ -370,7 +371,7 @@ def total_wrapper( language: Union[str, List[str]] = None, data_type: Union[str, List[str]] = None, all_bool: bool = False, - wikidata_dump: str = None, + wikidata_dump: Union[str, bool] = None, ) -> None: """ Conditionally provides the 
full functionality of the total command. @@ -387,10 +388,33 @@ def total_wrapper( all_bool : boolean Whether all languages and data types should be listed. - wikidata_dump : str - The local Wikidata dump that can be used to process data. + wikidata_dump : Union[str, bool] + The local Wikidata dump path that can be used to process data. + If True, indicates the flag was used without a path. """ + if wikidata_dump is True: # flag without a wikidata dump path + if all_bool: + language = "all" + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type="total", + type_output_dir=None, + wikidata_dump_path=None, + ) + return + + if isinstance(wikidata_dump, str): # if user provided a wikidata dump path + if all_bool: + language = "all" + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type="total", + type_output_dir=None, + wikidata_dump_path=wikidata_dump, + ) + return + if (not language and not data_type) and all_bool: print_total_lexemes() diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 99c22d7a..69b2782d 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -9,6 +9,9 @@ from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR from scribe_data.utils import language_metadata from tqdm import tqdm +from collections import Counter + +from scribe_data.utils import data_type_metadata # MARK: Logging logging.basicConfig( @@ -20,10 +23,12 @@ class LexemeProcessor: - def __init__(self, target_iso: str = None): + def __init__(self, target_iso: str = None, parse_type: str = None): self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} self.target_iso = target_iso + self.parse_type = parse_type + self.lexical_category_counts = defaultdict(Counter) # Used to filter the language_metadata to only include the target language and its sublanguages # If target_iso is not set, then all languages are included self.iso_to_name = {} @@ -39,49 +44,89 @@ def __init__(self, target_iso: str = None): if "iso" in sublang_data: self.iso_to_name[sublang_data["iso"]] = lang_name - def process_lines_for_translations(self, line: str) -> Dict[str, Any]: + def _process_lexeme_translations(self, lexeme: dict) -> dict: """ - Process a single line of lexeme data. + Process lexeme translations from lemmas, datatype and senses. + Returns a dictionary with word translations or empty dict if invalid. 
""" - try: - lexeme = orjson.loads(line.strip().rstrip(",")) + lemmas = lexeme.get("lemmas", {}) + datatype = lexeme.get("lexicalCategory") + senses = lexeme.get("senses", []) + + # Skip invalid entries + if not lemmas or not datatype: + return {} + + # Get the first lemma + first_lang, first_lemma = next(iter(lemmas.items())) + word = first_lemma.get("value", "").lower() # Normalize to lowercase + word_lang = first_lemma.get("language", "") + + # Skip if word is empty or language ISO is not in our metadata + if not word or word_lang not in self.iso_to_name: + return {} + + # Process all senses and their translations + translations = {} + for sense in senses: + glosses = sense.get("glosses", {}) + translations.update( + { + lang_code: gloss["value"] + for lang_code, gloss in glosses.items() + if lang_code + in self.iso_to_name # Only keep translations for known languages + } + ) + + if not translations: + return {} + + self.word_index[word][word_lang][datatype] = translations + return {word: {word_lang: {datatype: translations}}} - lemmas = lexeme.get("lemmas", {}) - datatype = lexeme.get("lexicalCategory") - senses = lexeme.get("senses", []) - - # Skip invalid entries - if not lemmas or not datatype: - return {} - - # Get the first lemma - first_lang, first_lemma = next(iter(lemmas.items())) - word = first_lemma.get("value", "").lower() # Normalize to lowercase - word_lang = first_lemma.get("language", "") - - # Skip if word is empty or language ISO is not in our metadata - if not word or word_lang not in self.iso_to_name: - return {} - - # Process all senses and their translations - translations = {} - for sense in senses: - glosses = sense.get("glosses", {}) - translations.update( - { - lang_code: gloss["value"] - for lang_code, gloss in glosses.items() - if lang_code - in self.iso_to_name # Only keep translations for known languages - } + def _process_lexeme_total(self, lexeme: dict) -> Dict[str, Any]: + """ + Process lexeme forms from lemmas, datatype and senses. + Returns a dictionary with word translations or empty dict if invalid. + """ + + lexicalCategory = lexeme.get("lexicalCategory") + + # Skip if lexicalCategory is missing or not in our data types + if not lexicalCategory or lexicalCategory not in data_type_metadata.values(): + return {} + lemmas = lexeme.get("lemmas", {}) + + for lemma in lemmas.values(): + lang = lemma.get("language") + if lang in self.iso_to_name: + # Convert QID to category name + category_name = next( + ( + key + for key, qid in data_type_metadata.items() + if qid == lexicalCategory + ), + None, ) + if category_name: + # Store counts per language + self.lexical_category_counts[lang][category_name] += 1 + break - if not translations: - return {} + def process_lines(self, line: str) -> Dict[str, Any]: + """ + Process a single line of lexeme data. 
+ """ + try: + lexeme = orjson.loads(line.strip().rstrip(",")) - self.word_index[word][word_lang][datatype] = translations + if self.parse_type == "translations": + return self._process_lexeme_translations(lexeme) + elif self.parse_type == "total": + return self._process_lexeme_total(lexeme) - return {word: {word_lang: {datatype: translations}}} except Exception as e: logging.error(f"Error processing line: {e}") return {} @@ -129,6 +174,30 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: print( f"Processed {self.stats['processed_entries']:,} entries in {self.stats['processing_time']:.2f} seconds" ) + if self.parse_type == "total": + print( + f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + ) + print("=" * 70) + + # Print counts for each language + for lang, counts in self.lexical_category_counts.items(): + lang_name = self.iso_to_name[lang] + # Print first row with language name + first_category = True + for category, count in counts.most_common(): + if first_category: + print(f"{lang_name:<20} {category:<25} {count:<25,}") + first_category = False + else: + # Print subsequent rows with blank language column + print(f"{'':<20} {category:<25} {count:<25,}") + # Add blank line between languages, but not after the last language + if lang != list(self.lexical_category_counts.keys())[-1]: + print( + f"\n{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + ) + print("=" * 70) except FileNotFoundError: logging.error(f"File not found: {file_path}") @@ -140,7 +209,7 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: def _process_batch(self, batch: list) -> None: for line in batch: # self.process_lines_for_forms(line) - self.process_lines_for_translations(line) + self.process_lines(line) def save_index(self, filepath: str, language_iso: str = None) -> None: """ @@ -218,19 +287,36 @@ def parse_dump( type_output_dir: str = DEFAULT_DUMP_EXPORT_DIR, file_path: str = "latest-lexemes.json.bz2", ): - index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + if parse_type == "total": + if language == "all": + print("Processing all lexemes...") + processor = LexemeProcessor(target_iso=None, parse_type=parse_type) + else: + print(f"Processing lexemes for {language}...") + processor = LexemeProcessor(target_iso=language, parse_type=parse_type) + + processor.process_file(file_path) + + else: + # Create the output directory if it doesn't exist + Path(type_output_dir).mkdir(parents=True, exist_ok=True) + + index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + print(f"Will save index to: {index_path}") + + processor = LexemeProcessor(target_iso=language, parse_type=parse_type) - processor = LexemeProcessor(target_iso=language) + print("Processing the lexeme data file...") + processor.process_file(file_path) - print("Processing the lexeme data file...") - processor.process_file(file_path) + print(f"Found {len(processor.word_index)} words in total") - # Get unique ISO codes from the processed data - iso_codes = set() - for word_data in processor.word_index.values(): - iso_codes.update(word_data.keys()) + # Get unique ISO codes from the processed data + iso_codes = set() + for word_data in processor.word_index.values(): + iso_codes.update(word_data.keys()) - # Save individual files for each valid language - for iso_code in iso_codes: - if iso_code in processor.iso_to_name: # Only process known ISO codes - processor.save_index(str(index_path), iso_code) + # Save individual files for each valid 
language + for iso_code in iso_codes: + if iso_code in processor.iso_to_name: # Only process known ISO codes + processor.save_index(str(index_path), iso_code) From 7eddf072e384b640cf06f984657b95b88c6eee6d Mon Sep 17 00:00:00 2001 From: axif Date: Fri, 27 Dec 2024 03:12:33 +0600 Subject: [PATCH 04/13] translation add in total, remove loggings --- src/scribe_data/wiktionary/parse_dump.py | 156 ++++++++++++----------- 1 file changed, 79 insertions(+), 77 deletions(-) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 69b2782d..8ad9adfe 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -5,7 +5,6 @@ import json from typing import Dict, Any from pathlib import Path -import logging from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR from scribe_data.utils import language_metadata from tqdm import tqdm @@ -13,77 +12,75 @@ from scribe_data.utils import data_type_metadata -# MARK: Logging -logging.basicConfig( - filename="lexeme_processor.log", - filemode="a", - format="%(asctime)s - %(levelname)s - %(message)s", - level=logging.ERROR, -) - class LexemeProcessor: def __init__(self, target_iso: str = None, parse_type: str = None): - self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + self.word_index = {} self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} self.target_iso = target_iso self.parse_type = parse_type - self.lexical_category_counts = defaultdict(Counter) - # Used to filter the language_metadata to only include the target language and its sublanguages - # If target_iso is not set, then all languages are included - self.iso_to_name = {} + self.lexical_category_counts = {} + self.translation_counts = {} + self._category_lookup = {v: k for k, v in data_type_metadata.items() if v} + self.iso_to_name = self._build_iso_mapping() + + def _build_iso_mapping(self) -> dict: + """Build mapping of ISO codes to language names""" + iso_mapping = {} + for lang_name, data in language_metadata.items(): - if lang_name == self.target_iso: - self.iso_to_name[data["iso"]] = lang_name - break - if not self.target_iso: - if "iso" in data: - self.iso_to_name[data["iso"]] = lang_name - elif "sub_languages" in data: - for sublang_data in data["sub_languages"].values(): - if "iso" in sublang_data: - self.iso_to_name[sublang_data["iso"]] = lang_name + if self.target_iso and lang_name != self.target_iso: + continue + + if "iso" in data: + iso_mapping[data["iso"]] = lang_name + + if not self.target_iso and "sub_languages" in data: + for sublang_data in data["sub_languages"].values(): + if "iso" in sublang_data: + iso_mapping[sublang_data["iso"]] = lang_name + + return iso_mapping def _process_lexeme_translations(self, lexeme: dict) -> dict: - """ - Process lexeme translations from lemmas, datatype and senses. - Returns a dictionary with word translations or empty dict if invalid. 
- """ + """Process lexeme translations from lemmas and senses""" lemmas = lexeme.get("lemmas", {}) - datatype = lexeme.get("lexicalCategory") - senses = lexeme.get("senses", []) + q_code = lexeme.get("lexicalCategory") - # Skip invalid entries - if not lemmas or not datatype: + # Convert Q-code to actual category name (e.g., Q1084 -> nouns) + category_name = self._category_lookup.get(q_code) + + if not (lemmas and category_name): return {} - # Get the first lemma - first_lang, first_lemma = next(iter(lemmas.items())) - word = first_lemma.get("value", "").lower() # Normalize to lowercase - word_lang = first_lemma.get("language", "") + try: + first_lang, first_lemma = next(iter(lemmas.items())) + word = first_lemma.get("value", "").lower() + word_lang = first_lemma.get("language", "") - # Skip if word is empty or language ISO is not in our metadata - if not word or word_lang not in self.iso_to_name: - return {} + if not (word and word_lang in self.iso_to_name): + return {} - # Process all senses and their translations - translations = {} - for sense in senses: - glosses = sense.get("glosses", {}) - translations.update( - { - lang_code: gloss["value"] - for lang_code, gloss in glosses.items() - if lang_code - in self.iso_to_name # Only keep translations for known languages - } - ) + translations = { + lang_code: gloss["value"] + for sense in lexeme.get("senses", []) + for lang_code, gloss in sense.get("glosses", {}).items() + if lang_code in self.iso_to_name + } - if not translations: - return {} + if translations: + if word not in self.word_index: + self.word_index[word] = {} + if word_lang not in self.word_index[word]: + self.word_index[word][word_lang] = {} - self.word_index[word][word_lang][datatype] = translations - return {word: {word_lang: {datatype: translations}}} + self.word_index[word][word_lang][category_name] = translations + return {word: {word_lang: {category_name: translations}}} + + except (StopIteration, AttributeError): + pass + + return {} def _process_lexeme_total(self, lexeme: dict) -> Dict[str, Any]: """ @@ -98,23 +95,27 @@ def _process_lexeme_total(self, lexeme: dict) -> Dict[str, Any]: return {} lemmas = lexeme.get("lemmas", {}) + category_name = self._category_lookup.get(lexicalCategory) + if not category_name: + return {} + + # Process only the first valid language entry for lemma in lemmas.values(): lang = lemma.get("language") if lang in self.iso_to_name: - # Convert QID to category name - category_name = next( - ( - key - for key, qid in data_type_metadata.items() - if qid == lexicalCategory - ), - None, + if lang not in self.lexical_category_counts: + self.lexical_category_counts[lang] = Counter() + self.translation_counts[lang] = Counter() + # Update counts + self.lexical_category_counts[lang][category_name] += 1 + translation_count = sum( + len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) ) - if category_name: - # Store counts per language - self.lexical_category_counts[lang][category_name] += 1 + self.translation_counts[lang][category_name] += translation_count break + return {} + def process_lines(self, line: str) -> Dict[str, Any]: """ Process a single line of lexeme data. 
@@ -128,7 +129,7 @@ def process_lines(self, line: str) -> Dict[str, Any]: return self._process_lexeme_total(lexeme) except Exception as e: - logging.error(f"Error processing line: {e}") + print(f"Error processing line: {e}") return {} def process_file(self, file_path: str, batch_size: int = 1000) -> None: @@ -176,9 +177,9 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: ) if self.parse_type == "total": print( - f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25} {'Total Translations':<20}" ) - print("=" * 70) + print("=" * 90) # Print counts for each language for lang, counts in self.lexical_category_counts.items(): @@ -186,24 +187,27 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: # Print first row with language name first_category = True for category, count in counts.most_common(): + translation_count = self.translation_counts[lang][category] if first_category: - print(f"{lang_name:<20} {category:<25} {count:<25,}") + print( + f"{lang_name:<20} {category:<25} {count:<25,} {translation_count:<20,}" + ) first_category = False else: # Print subsequent rows with blank language column - print(f"{'':<20} {category:<25} {count:<25,}") + print( + f"{'':<20} {category:<25} {count:<25,} {translation_count:<20,}" + ) # Add blank line between languages, but not after the last language if lang != list(self.lexical_category_counts.keys())[-1]: print( - f"\n{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}" + f"\n{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25} {'Total Translations':<20}" ) - print("=" * 70) + print("=" * 90) except FileNotFoundError: - logging.error(f"File not found: {file_path}") print(f"Error: File not found - {file_path}") except Exception as e: - logging.error(f"Error processing file: {e}") print(f"Error processing file: {e}") def _process_batch(self, batch: list) -> None: @@ -263,10 +267,8 @@ def load_index(self, filepath: str) -> None: ) self._recursive_update(self.word_index, loaded_data) except FileNotFoundError: - logging.error(f"Index file not found: {filepath}") print(f"Error: Index file not found - {filepath}") except Exception as e: - logging.error(f"Error loading index: {e}") print(f"Error loading index: {e}") def _recursive_update(self, dd, data): From 7c4d597eea54570ef4dc314b7b678705456db362 Mon Sep 17 00:00:00 2001 From: axif Date: Sat, 28 Dec 2024 02:37:06 +0600 Subject: [PATCH 05/13] overwrite file feature added --- src/scribe_data/cli/main.py | 3 +++ src/scribe_data/wiktionary/parse_dump.py | 26 +++++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 57ac7973..281a05f2 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -415,6 +415,9 @@ def main() -> None: ) elif args.command in ["interactive", "i"]: + rprint( + f"[bold cyan]Welcome to {get_version_message()} interactive mode![/bold cyan]" + ) action = select( "What would you like to do?", choices=[ diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 8ad9adfe..10d7b2da 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -9,6 +9,7 @@ from scribe_data.utils import language_metadata from tqdm import tqdm from collections import Counter +import questionary from scribe_data.utils import data_type_metadata @@ -283,6 +284,19 @@ def 
get_word_info(self, word: str) -> Dict[str, Any]: return self.word_index.get(word.lower(), {}) +def check_index_exists(index_path: Path) -> bool: + """Check if index file exists and prompt user for action if it does.""" + if index_path.exists(): + print(f"\nIndex file already exists at: {index_path}") + choice = questionary.select( + "Choose an action:", + choices=["Overwrite existing data", "Skip process"], + default="Skip process", + ).ask() + return choice == "Skip process" + return False + + def parse_dump( language: str = None, parse_type: str = None, @@ -303,7 +317,17 @@ def parse_dump( # Create the output directory if it doesn't exist Path(type_output_dir).mkdir(parents=True, exist_ok=True) - index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + if language: + index_path = ( + Path(type_output_dir) / language / f"lexeme_index_{parse_type}.json" + ) + if check_index_exists(index_path): + return + else: + index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + if check_index_exists(index_path): + return + print(f"Will save index to: {index_path}") processor = LexemeProcessor(target_iso=language, parse_type=parse_type) From 877e6b2485a8ef5ba8cfd53e0f48ebf119f81933 Mon Sep 17 00:00:00 2001 From: axif Date: Sat, 28 Dec 2024 17:25:01 +0600 Subject: [PATCH 06/13] final and clean code --- .gitignore | 1 - src/scribe_data/utils.py | 16 ++ src/scribe_data/wikidata/wikidata_utils.py | 14 +- src/scribe_data/wiktionary/parse_dump.py | 187 +++++++++++---------- 4 files changed, 125 insertions(+), 93 deletions(-) diff --git a/.gitignore b/.gitignore index 0f860902..610b9da8 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,3 @@ scribe_data_tsv_export/* # MARK: Wiki Dumps *.json.bz2 -*.log diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 0381816e..36f25877 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -25,6 +25,7 @@ import json import os import re +import questionary from datetime import datetime from importlib import resources from pathlib import Path @@ -703,3 +704,18 @@ def check_lexeme_dump_prompt_download(output_dir: str): else: rprint("[bold blue]Skipping download.[/bold blue]") return True + + +def check_index_exists(index_path: Path) -> bool: + """ + Check if JSON wiktionary dump file exists and prompt user for action if it does. + """ + if index_path.exists(): + print(f"\nIndex file already exists at: {index_path}") + choice = questionary.select( + "Choose an action:", + choices=["Overwrite existing data", "Skip process"], + default="Skip process", + ).ask() + return choice == "Skip process" + return False diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index a58b6263..7ab46f7a 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -43,12 +43,14 @@ def parse_wd_lexeme_dump( Parameters ---------- - wikidata_dump_path : str - The local Wikidata dump that should be used to get data. - output_dir : str - The directory to save the parsed data. - language : str - The language to parse the data for. + language : str + The language to parse the data for. + wikidata_dump_type : str + The type of Wikidata dump to parse (e.g. "total", "translations"). + type_output_dir : str + The directory to save the parsed JSON data. + wikidata_dump_path : str + The local Wikidata dump directory that should be used to get data. Returns ------- The requested data saved locally given file type and location arguments. 
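For reviewers following along, the snippet below is a minimal sketch of how the entry point documented above can be exercised directly from Python once this patch is applied. It is illustrative only: the language, output directory, and dump directory are placeholder values rather than project defaults, and the call simply follows the signature described in the docstring above.

    from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump

    # Build a translations index for one language from a local lexeme dump.
    # "path/to/lexeme_dumps" is a hypothetical directory holding the
    # downloaded .json.bz2 dump; it is not a project default.
    parse_wd_lexeme_dump(
        language="bengali",
        wikidata_dump_type="translations",
        type_output_dir="scribe_data_json_export",
        wikidata_dump_path="path/to/lexeme_dumps",
    )

If no dump is found in that directory, the download wrapper from earlier in this series prompts before fetching the latest dump from dumps.wikimedia.org; passing wikidata_dump_type="total" instead prints per-language lexeme and translation counts rather than writing a JSON index.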
diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 10d7b2da..e4a121f5 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -1,27 +1,51 @@ +""" +Functions for parsing Wikidata lexeme dumps. + +.. raw:: html + +""" + import bz2 import orjson -from collections import defaultdict import time import json -from typing import Dict, Any -from pathlib import Path -from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR -from scribe_data.utils import language_metadata -from tqdm import tqdm -from collections import Counter -import questionary -from scribe_data.utils import data_type_metadata +from tqdm import tqdm +from pathlib import Path +from collections import defaultdict, Counter +from typing import Dict, Any +from scribe_data.utils import ( + DEFAULT_DUMP_EXPORT_DIR, + language_metadata, + data_type_metadata, + check_index_exists, +) class LexemeProcessor: def __init__(self, target_iso: str = None, parse_type: str = None): - self.word_index = {} + # Pre-compute lookups once during initialization + self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} self.target_iso = target_iso self.parse_type = parse_type - self.lexical_category_counts = {} - self.translation_counts = {} + self.lexical_category_counts = defaultdict(Counter) + self.translation_counts = defaultdict(Counter) self._category_lookup = {v: k for k, v in data_type_metadata.items() if v} self.iso_to_name = self._build_iso_mapping() @@ -46,40 +70,37 @@ def _build_iso_mapping(self) -> dict: def _process_lexeme_translations(self, lexeme: dict) -> dict: """Process lexeme translations from lemmas and senses""" lemmas = lexeme.get("lemmas", {}) - q_code = lexeme.get("lexicalCategory") + qid = lexeme.get("lexicalCategory") - # Convert Q-code to actual category name (e.g., Q1084 -> nouns) - category_name = self._category_lookup.get(q_code) - - if not (lemmas and category_name): + # Early return if missing required data + if not (lemmas and qid): + return {} + # Convert Qid to actual category name (e.g., Q1084 -> nouns) + category_name = self._category_lookup.get(qid) + if not category_name: return {} - try: - first_lang, first_lemma = next(iter(lemmas.items())) - word = first_lemma.get("value", "").lower() - word_lang = first_lemma.get("language", "") + # Process first valid lemma + for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.iso_to_name: + continue - if not (word and word_lang in self.iso_to_name): - return {} + word = lemma_data.get("value", "").lower() + if not word: + continue - translations = { - lang_code: gloss["value"] - for sense in lexeme.get("senses", []) - for lang_code, gloss in sense.get("glosses", {}).items() - if lang_code in self.iso_to_name - } + # Collect all valid translations in one pass + translations = {} + for sense in lexeme.get("senses", []): + for lang_code, gloss in sense.get("glosses", {}).items(): + if lang_code in self.iso_to_name: + translations[lang_code] = gloss["value"] if translations: - if word not in self.word_index: - self.word_index[word] = {} - if word_lang not in self.word_index[word]: - self.word_index[word][word_lang] = {} - - self.word_index[word][word_lang][category_name] = translations - return {word: {word_lang: {category_name: translations}}} + self.word_index[word][lang_code][category_name] = translations + return {word: {lang_code: {category_name: translations}}} 
- except (StopIteration, AttributeError): - pass + break # Process only first valid lemma return {} @@ -133,7 +154,7 @@ def process_lines(self, line: str) -> Dict[str, Any]: print(f"Error processing line: {e}") return {} - def process_file(self, file_path: str, batch_size: int = 1000) -> None: + def process_file(self, file_path: str, batch_size: int = 50000): start_time = time.time() try: @@ -146,7 +167,6 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: bzfile.seek(0) batch = [] - # Use dynamic total based on file size for line in tqdm( bzfile, desc="Processing entries", total=total_entries ): @@ -212,11 +232,13 @@ def process_file(self, file_path: str, batch_size: int = 1000) -> None: print(f"Error processing file: {e}") def _process_batch(self, batch: list) -> None: + """ + Process multiple lines at once + """ for line in batch: - # self.process_lines_for_forms(line) self.process_lines(line) - def save_index(self, filepath: str, language_iso: str = None) -> None: + def export_json(self, filepath: str, language_iso: str = None) -> None: """ Save index to file, optionally filtering by language ISO code. """ @@ -235,8 +257,13 @@ def save_index(self, filepath: str, language_iso: str = None) -> None: if language_iso in lang_data: filtered_index[word] = {language_iso: lang_data[language_iso]} - # Create language-specific filepath using full name + # Create language-specific filepath, removing potential double paths base_path = Path(filepath) + # Remove language name from base_path if it exists to prevent duplication + if full_language_name in base_path.parts: + parts = [p for p in base_path.parts if p != full_language_name] + base_path = Path(*parts) + lang_filepath = base_path.parent / full_language_name / base_path.name lang_filepath.parent.mkdir(parents=True, exist_ok=True) @@ -258,44 +285,6 @@ def _convert_defaultdict_to_dict(self, dd): dd = {k: self._convert_defaultdict_to_dict(v) for k, v in dd.items()} return dd - def load_index(self, filepath: str) -> None: - print(f"Loading index from {filepath}...") - try: - with open(filepath, "r", encoding="utf-8") as f: - loaded_data = json.load(f) - self.word_index = defaultdict( - lambda: defaultdict(lambda: defaultdict(dict)) - ) - self._recursive_update(self.word_index, loaded_data) - except FileNotFoundError: - print(f"Error: Index file not found - {filepath}") - except Exception as e: - print(f"Error loading index: {e}") - - def _recursive_update(self, dd, data): - for key, value in data.items(): - if isinstance(value, dict): - dd[key] = defaultdict(lambda: defaultdict(dict)) - self._recursive_update(dd[key], value) - else: - dd[key] = value - - def get_word_info(self, word: str) -> Dict[str, Any]: - return self.word_index.get(word.lower(), {}) - - -def check_index_exists(index_path: Path) -> bool: - """Check if index file exists and prompt user for action if it does.""" - if index_path.exists(): - print(f"\nIndex file already exists at: {index_path}") - choice = questionary.select( - "Choose an action:", - choices=["Overwrite existing data", "Skip process"], - default="Skip process", - ).ask() - return choice == "Skip process" - return False - def parse_dump( language: str = None, @@ -303,6 +292,34 @@ def parse_dump( type_output_dir: str = DEFAULT_DUMP_EXPORT_DIR, file_path: str = "latest-lexemes.json.bz2", ): + """ + Process and parse Wikidata lexeme dumps, either analyzing all + or filtering for a specific language. + + Parameters + ---------- + language : str, + ISO code of the language to process. 
If 'all', processes all languages. + parse_type : str + Type of parsing to perform. Options are: + - 'total': Generate statistics about lexeme counts + - 'translations': Create translation indexes + type_output_dir : str + Directory where output files will be saved. Defaults to DEFAULT_DUMP_EXPORT_DIR. + file_path : str + Path to the lexeme dump file. Defaults to 'latest-lexemes.json.bz2'. + + Notes + ----- + When parse_type is 'total': + - Total number of lexemes per language along with different lexical categories + - Number of total translations available + + When parse_type is 'translations', it creates JSON index files containing: + - Word-to-translation mappings + - Lexical category information + + """ if parse_type == "total": if language == "all": print("Processing all lexemes...") @@ -318,13 +335,11 @@ def parse_dump( Path(type_output_dir).mkdir(parents=True, exist_ok=True) if language: - index_path = ( - Path(type_output_dir) / language / f"lexeme_index_{parse_type}.json" - ) + index_path = Path(type_output_dir) / language / f"lexeme_{parse_type}.json" if check_index_exists(index_path): return else: - index_path = Path(type_output_dir) / f"lexeme_index_{parse_type}.json" + index_path = Path(type_output_dir) / f"lexeme_{parse_type}.json" if check_index_exists(index_path): return @@ -345,4 +360,4 @@ def parse_dump( # Save individual files for each valid language for iso_code in iso_codes: if iso_code in processor.iso_to_name: # Only process known ISO codes - processor.save_index(str(index_path), iso_code) + processor.export_json(str(index_path), iso_code) From 69f4bc70bd47eee2fb2c886a3b7a0a545f9be116 Mon Sep 17 00:00:00 2001 From: axif Date: Sat, 28 Dec 2024 18:39:19 +0600 Subject: [PATCH 07/13] removed orjjson --- src/scribe_data/wiktionary/parse_dump.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index e4a121f5..1e15971f 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -21,9 +21,8 @@ """ import bz2 -import orjson -import time import json +import time from tqdm import tqdm from pathlib import Path @@ -143,7 +142,7 @@ def process_lines(self, line: str) -> Dict[str, Any]: Process a single line of lexeme data. 
""" try: - lexeme = orjson.loads(line.strip().rstrip(",")) + lexeme = json.loads(line.strip().rstrip(",")) if self.parse_type == "translations": return self._process_lexeme_translations(lexeme) From 612ebe5d736843a762d66dde59e514307ffdfe36 Mon Sep 17 00:00:00 2001 From: axif Date: Mon, 30 Dec 2024 01:46:32 +0600 Subject: [PATCH 08/13] Add orjson dependency and add forms & boost interactive mood --- requirements.txt | 1 + src/scribe_data/cli/get.py | 38 +- src/scribe_data/cli/interactive.py | 44 ++ src/scribe_data/cli/main.py | 2 +- src/scribe_data/cli/total.py | 13 +- src/scribe_data/utils.py | 13 +- src/scribe_data/wikidata/wikidata_utils.py | 41 +- src/scribe_data/wiktionary/parse_dump.py | 683 ++++++++++++++------- tests/cli/test_get.py | 71 ++- 9 files changed, 611 insertions(+), 295 deletions(-) diff --git a/requirements.txt b/requirements.txt index abbd5e44..4e1d6d55 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ ruff>=0.3.3 SPARQLWrapper>=2.0.0 sphinx-rtd-theme>=3.0.0 tqdm==4.66.4 +orjson>=3.10.12 diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 4bfa0f37..118badb0 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -117,8 +117,12 @@ def prompt_user_download_all(): if all_bool: if language: if prompt_user_download_all(): - parse_wd_lexeme_dump() - + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["form"], + data_types=data_types, + type_output_dir=output_dir, + ) else: language_or_sub_language = language.split(" ")[0] print(f"Updating all data types for language: {language.title()}") @@ -134,8 +138,12 @@ def prompt_user_download_all(): elif data_type: if prompt_user_download_all(): - parse_wd_lexeme_dump() - + parse_wd_lexeme_dump( + language=None, + wikidata_dump_type=["form"], + data_types=[data_type], + type_output_dir=output_dir, + ) else: print(f"Updating all languages for data type: {data_type.capitalize()}") query_data( @@ -153,7 +161,13 @@ def prompt_user_download_all(): rprint( "[bold red]Note that the download all functionality must use Wikidata dumps to observe responsible Wikidata Query Service usage practices.[/bold red]" ) - parse_wd_lexeme_dump() + parse_wd_lexeme_dump( + language="all", + wikidata_dump_type=["form", "translations"], + data_types="all", + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + ) # MARK: Emojis @@ -165,7 +179,19 @@ def prompt_user_download_all(): elif data_type == "translations": parse_wd_lexeme_dump( language=language, - wikidata_dump_type="translations", + wikidata_dump_type=["translations"], + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + ) + return + + # MARK: Query Data using Wikidata Dump + + elif wikidata_dump: + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["form"], + data_types=data_types, type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, ) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index 73e12426..8cfea57c 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -38,8 +38,10 @@ from scribe_data.cli.get import get_data from scribe_data.cli.total import total_wrapper from scribe_data.cli.version import get_version_message +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump from scribe_data.utils import ( DEFAULT_JSON_EXPORT_DIR, + DEFAULT_DUMP_EXPORT_DIR, data_type_metadata, language_metadata, list_all_languages, @@ -262,6 +264,7 @@ def request_total_lexeme_loop(): choices=[ 
Choice("Configure total lexemes request", "total"), Choice("Run total lexemes request", "run"), + Choice("Run total lexemes request with lexeme dumps", "run_all"), Choice("Exit", "exit"), ], ).ask() @@ -275,6 +278,18 @@ def request_total_lexeme_loop(): config.selected_languages, config.selected_data_types = [], [] rprint(THANK_YOU_MESSAGE) break + elif choice == "run_all": + if wikidata_dump_path := prompt( + f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): " + ): + wikidata_dump_path = Path(wikidata_dump_path) + + parse_wd_lexeme_dump( + language=config.selected_languages, + wikidata_dump_type=["total"], + wikidata_dump_path=wikidata_dump_path, + ) + break elif choice == "exit": return else: @@ -335,6 +350,12 @@ def start_interactive_mode(operation: str = None): # Choice("See list of languages", "languages"), Choice("Exit", "exit"), ] + elif operation == "translations": + choices = [ + Choice("Configure translations request", "translations"), + # Choice("See list of languages", "languages"), + Choice("Exit", "exit"), + ] else: choices = [ @@ -358,6 +379,29 @@ def start_interactive_mode(operation: str = None): request_total_lexeme_loop() break + elif choice == "translations": + prompt_for_languages() + + if wikidata_dump_path := prompt( + f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): " + ): + wikidata_dump_path = Path(wikidata_dump_path) + + if output_dir := prompt( + f"Enter output directory (default: {config.output_dir}): " + ): + config.output_dir = Path(output_dir) + + parse_wd_lexeme_dump( + language=config.selected_languages, + wikidata_dump_type=["translations"], + data_types=None, + type_output_dir=config.output_dir, + wikidata_dump_path=wikidata_dump_path, + ) + + break + # elif choice == "languages": # see_list_languages() # break diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 3f7d045e..f696b238 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -436,7 +436,7 @@ def main() -> None: elif action == "Get data": start_interactive_mode(operation="get") elif action == "Get translations": - print("Coming soon!") + start_interactive_mode(operation="translations") else: print("Skipping action") else: diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 37016c97..e543256e 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -392,25 +392,22 @@ def total_wrapper( The local Wikidata dump path that can be used to process data. If True, indicates the flag was used without a path. 
""" + # Handle --all flag + if all_bool and wikidata_dump: + language = "all" if wikidata_dump is True: # flag without a wikidata dump path - if all_bool: - language = "all" parse_wd_lexeme_dump( language=language, - wikidata_dump_type="total", - type_output_dir=None, + wikidata_dump_type=["total"], wikidata_dump_path=None, ) return if isinstance(wikidata_dump, str): # if user provided a wikidata dump path - if all_bool: - language = "all" parse_wd_lexeme_dump( language=language, - wikidata_dump_type="total", - type_output_dir=None, + wikidata_dump_type=["total"], wikidata_dump_path=wikidata_dump, ) return diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 36f25877..163af4ae 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -706,16 +706,27 @@ def check_lexeme_dump_prompt_download(output_dir: str): return True -def check_index_exists(index_path: Path) -> bool: +def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool: """ Check if JSON wiktionary dump file exists and prompt user for action if it does. + Returns True if user chooses to skip (i.e., we do NOT proceed). + Returns False if the file doesn't exist or user chooses to overwrite (i.e., we DO proceed). + + Parameters: + index_path: Path to check + overwrite_all: If True, automatically overwrite without prompting """ if index_path.exists(): + if overwrite_all: + return False + print(f"\nIndex file already exists at: {index_path}") choice = questionary.select( "Choose an action:", choices=["Overwrite existing data", "Skip process"], default="Skip process", ).ask() + + # If user selects "Skip process", return True meaning "don't proceed" return choice == "Skip process" return False diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 7ab46f7a..5e29f2e1 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -23,9 +23,11 @@ from pathlib import Path from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper +from typing import List, Union from scribe_data.cli.download import wd_lexeme_dump_download_wrapper from scribe_data.wiktionary.parse_dump import parse_dump +from scribe_data.utils import language_metadata, data_type_metadata sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) @@ -33,8 +35,9 @@ def parse_wd_lexeme_dump( - language: str = None, - wikidata_dump_type: str = None, + language: Union[str, List[str]] = None, + wikidata_dump_type: List[str] = None, + data_types: List[str] = None, type_output_dir: str = None, wikidata_dump_path: str = None, ): @@ -43,18 +46,28 @@ def parse_wd_lexeme_dump( Parameters ---------- - language : str - The language to parse the data for. - wikidata_dump_type : str - The type of Wikidata dump to parse (e.g. "total", "translations"). - type_output_dir : str - The directory to save the parsed JSON data. - wikidata_dump_path : str + language : Union[str, List[str]] + The language(s) to parse the data for. Use "all" for all languages. + wikidata_dump_type : List[str] + The type(s) of Wikidata dump to parse (e.g. ["total", "translations", "form"]). + data_types : List[str] + The categories to parse when using "form" type (e.g. ["nouns", "adverbs"]). + type_output_dir : str, optional + The directory to save the parsed JSON data. If None, uses default directory. + wikidata_dump_path : str, optional The local Wikidata dump directory that should be used to get data. 
- Returns - ------- - The requested data saved locally given file type and location arguments. """ + # Convert "all" to list of all languages + if isinstance(language, str) and language.lower() == "all": + language = list(language_metadata.keys()) + if isinstance(data_types, str) and data_types.lower() == "all": + # Exclude translations as it's a separate section + data_types = [ + dt + for dt in data_type_metadata.keys() + if dt != "translations" and dt != "emoji-keywords" + ] + file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) if isinstance(file_path, (str, Path)): @@ -67,10 +80,10 @@ def parse_wd_lexeme_dump( parse_dump( language=language, parse_type=wikidata_dump_type, - type_output_dir=type_output_dir, + data_types=data_types, file_path=file_path, + output_dir=type_output_dir, ) - return rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]") diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 1e15971f..36bbbc69 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -21,65 +21,130 @@ """ import bz2 -import json import time +import orjson from tqdm import tqdm from pathlib import Path from collections import defaultdict, Counter -from typing import Dict, Any +from typing import Union, List from scribe_data.utils import ( DEFAULT_DUMP_EXPORT_DIR, language_metadata, data_type_metadata, check_index_exists, ) +import questionary class LexemeProcessor: - def __init__(self, target_iso: str = None, parse_type: str = None): - # Pre-compute lookups once during initialization - self.word_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + def __init__( + self, + target_iso: Union[str, List[str]] = None, + parse_type: List[str] = None, + data_types: List[str] = None, + ): + """ + parse_type can be any combination of: + - 'translations' + - 'form' + - 'total' + data_types is a list of categories (e.g., ["nouns", "adverbs"]) for forms. 
+ """ + # Pre-compute sets for faster lookups + self.parse_type = set(parse_type or []) + self.data_types = set(data_types or []) + self.target_iso = set( + [target_iso] if isinstance(target_iso, str) else target_iso or [] + ) + + # Pre-compute valid categories and languages + self._category_lookup = {v: k for k, v in data_type_metadata.items()} + self.valid_categories = set(data_type_metadata.values()) + + # Build optimized language mapping + self.iso_to_name = self._build_iso_mapping() + self.valid_iso_codes = set(self.iso_to_name.keys()) + + # Separate data structures + self.translations_index = defaultdict( + lambda: defaultdict(lambda: defaultdict(dict)) + ) + self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + + # Stats self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} - self.target_iso = target_iso - self.parse_type = parse_type + + # For category lookups, invert data_type_metadata + # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...} + self._category_lookup = {v: k for k, v in data_type_metadata.items()} + + # Build map from ISO to full language name + self.iso_to_name = self._build_iso_mapping() + + # For "total" usage self.lexical_category_counts = defaultdict(Counter) self.translation_counts = defaultdict(Counter) - self._category_lookup = {v: k for k, v in data_type_metadata.items() if v} - self.iso_to_name = self._build_iso_mapping() + self.forms_counts = defaultdict(Counter) + # MARK: build iso mapping def _build_iso_mapping(self) -> dict: - """Build mapping of ISO codes to language names""" + """ + Build mapping of ISO codes to language names based on language_metadata. + If self.target_iso is non-null, only include those iso codes. + """ iso_mapping = {} - for lang_name, data in language_metadata.items(): - if self.target_iso and lang_name != self.target_iso: + if self.target_iso and lang_name not in self.target_iso: continue + iso_code = data.get("iso") + if iso_code: + iso_mapping[iso_code] = lang_name + return iso_mapping - if "iso" in data: - iso_mapping[data["iso"]] = lang_name + # MARK: process total + def _process_lexeme_total(self, lexeme: dict) -> None: + """ + Gather stats if 'total' is in parse_type: how many entries per language & category, + how many translations, etc. + """ + lexicalCategory = lexeme.get("lexicalCategory") + if not lexicalCategory or lexicalCategory not in data_type_metadata.values(): + return - if not self.target_iso and "sub_languages" in data: - for sublang_data in data["sub_languages"].values(): - if "iso" in sublang_data: - iso_mapping[sublang_data["iso"]] = lang_name + category_name = self._category_lookup.get(lexicalCategory) + if not category_name: + return - return iso_mapping + # Update counters + lemmas = lexeme.get("lemmas", {}) + for lemma in lemmas.values(): + lang = lemma.get("language") + if lang in self.iso_to_name: + self.lexical_category_counts[lang][category_name] += 1 + translation_count = sum( + len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) + ) + self.translation_counts[lang][category_name] += translation_count + break - def _process_lexeme_translations(self, lexeme: dict) -> dict: - """Process lexeme translations from lemmas and senses""" + # MARK: process translations + def _process_lexeme_translations(self, lexeme: dict) -> None: + """ + Process gloss-based translations if 'translations' is in parse_type. + Store them in self.translations_index. 
+ """ lemmas = lexeme.get("lemmas", {}) qid = lexeme.get("lexicalCategory") - # Early return if missing required data if not (lemmas and qid): - return {} - # Convert Qid to actual category name (e.g., Q1084 -> nouns) + return + category_name = self._category_lookup.get(qid) if not category_name: - return {} + return - # Process first valid lemma + # Only store first valid lemma for translations for lang_code, lemma_data in lemmas.items(): if lang_code not in self.iso_to_name: continue @@ -88,275 +153,431 @@ def _process_lexeme_translations(self, lexeme: dict) -> dict: if not word: continue - # Collect all valid translations in one pass + # Build translations from sense glosses translations = {} for sense in lexeme.get("senses", []): - for lang_code, gloss in sense.get("glosses", {}).items(): - if lang_code in self.iso_to_name: - translations[lang_code] = gloss["value"] + for sense_lang_code, gloss in sense.get("glosses", {}).items(): + if sense_lang_code in self.iso_to_name: + translations[sense_lang_code] = gloss["value"] if translations: - self.word_index[word][lang_code][category_name] = translations - return {word: {lang_code: {category_name: translations}}} - - break # Process only first valid lemma + self.translations_index[word][lang_code][category_name] = translations + break # Only handle the first lemma - return {} - - def _process_lexeme_total(self, lexeme: dict) -> Dict[str, Any]: + # MARK: process forms + def _process_lexeme_forms(self, lexeme: dict) -> None: """ - Process lexeme forms from lemmas, datatype and senses. - Returns a dictionary with word translations or empty dict if invalid. + Process forms for categories in self.data_types if 'form' is in parse_type. + Store them in self.forms_index. """ - - lexicalCategory = lexeme.get("lexicalCategory") - - # Skip if lexicalCategory is missing or not in our data types - if not lexicalCategory or lexicalCategory not in data_type_metadata.values(): - return {} lemmas = lexeme.get("lemmas", {}) + lexical_category = lexeme.get("lexicalCategory") - category_name = self._category_lookup.get(lexicalCategory) + # Skip if category missing or not recognized + if not lexical_category or lexical_category not in data_type_metadata.values(): + return + + # Convert Q1084 -> "nouns", etc. 
+ category_name = self._category_lookup.get(lexical_category) if not category_name: - return {} + return - # Process only the first valid language entry - for lemma in lemmas.values(): - lang = lemma.get("language") - if lang in self.iso_to_name: - if lang not in self.lexical_category_counts: - self.lexical_category_counts[lang] = Counter() - self.translation_counts[lang] = Counter() - # Update counts - self.lexical_category_counts[lang][category_name] += 1 - translation_count = sum( - len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) - ) - self.translation_counts[lang][category_name] += translation_count - break + # If the category_name is NOT in our data_types list, skip + # e.g., category_name = "nouns", but user didn't request "nouns" in data_types + if category_name not in self.data_types: + return + + # Process forms + for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.iso_to_name: + continue - return {} + word = lemma_data.get("value", "").lower() + if not word: + continue + + forms_data = defaultdict(list) + for form in lexeme.get("forms", []): + representations = form.get("representations", {}) + grammatical_features = form.get("grammaticalFeatures", []) - def process_lines(self, line: str) -> Dict[str, Any]: + for rep_lang, rep_data in representations.items(): + if rep_lang == lang_code: + form_value = rep_data.get("value") + if form_value: + forms_data[form_value].extend(grammatical_features) + + if forms_data: + self.forms_index[word][lang_code][category_name] = dict(forms_data) + self.forms_counts[lang_code][category_name] += len(forms_data) + break # only first valid lemma + + # MARK: process lines + def process_lines(self, line: str) -> None: """ - Process a single line of lexeme data. + Process one line of data. 
Depending on parse_type, we do: + - total stats + - translations + - form categories (filtered by data_types) """ try: - lexeme = json.loads(line.strip().rstrip(",")) + lexeme = orjson.loads(line.strip().rstrip(",")) + if not lexeme: + return + + # Get common values once + lemmas = lexeme.get("lemmas", {}) + lexical_category = lexeme.get("lexicalCategory") + + if not (lemmas and lexical_category in self.valid_categories): + return + + category_name = self._category_lookup.get(lexical_category) + if not category_name: + return + + # Process each type in a single pass through the data + for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.valid_iso_codes: + continue + + word = lemma_data.get("value", "").lower() + if not word: + continue + + if "total" in self.parse_type: + self.lexical_category_counts[lang_code][category_name] += 1 + translation_count = sum( + len(sense.get("glosses", {})) + for sense in lexeme.get("senses", []) + ) + self.translation_counts[lang_code][category_name] += ( + translation_count + ) + + if "translations" in self.parse_type: + translations = { + lang: gloss["value"] + for sense in lexeme.get("senses", []) + for lang, gloss in sense.get("glosses", {}).items() + if lang in self.valid_iso_codes + } + if translations: + self.translations_index[word][lang_code][category_name] = ( + translations + ) - if self.parse_type == "translations": - return self._process_lexeme_translations(lexeme) - elif self.parse_type == "total": - return self._process_lexeme_total(lexeme) + if "form" in self.parse_type and category_name in self.data_types: + forms_data = defaultdict(list) + for form in lexeme.get("forms", []): + for rep_lang, rep_data in form.get( + "representations", {} + ).items(): + if rep_lang == lang_code: + form_value = rep_data.get("value") + if form_value: + forms_data[form_value].extend( + form.get("grammaticalFeatures", []) + ) + + if forms_data: + self.forms_index[word][lang_code][category_name] = dict( + forms_data + ) + self.forms_counts[lang_code][category_name] += len(forms_data) + + break # Only process first valid lemma except Exception as e: print(f"Error processing line: {e}") - return {} + # MARK: process file def process_file(self, file_path: str, batch_size: int = 50000): - start_time = time.time() - - try: - # Get file size and estimate number of entries (average 263 bytes per entry based on real data) + """ + Main loop: read lines from file (bz2) in batches, call process_lines on each. 
+ """ + # Use context manager for better resource handling + with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: + # Skip header if present + first_line = bzfile.readline() + if not first_line.strip().startswith("["): + bzfile.seek(0) + + # Process in larger batches for better performance + batch = [] + start_time = time.time() total_entries = int(Path(file_path).stat().st_size / 263) - with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: - first_line = bzfile.readline() - if not first_line.strip().startswith("["): - bzfile.seek(0) - - batch = [] - for line in tqdm( - bzfile, desc="Processing entries", total=total_entries - ): - stripped_line = line.strip() - if stripped_line in [ - "]", - "[", - ",", - "", - ]: # Skip structural JSON elements - continue - + for line in tqdm(bzfile, total=total_entries, desc="Processing entries"): + if line.strip() not in ["[", "]", ",", ""]: batch.append(line) - if len(batch) >= batch_size: self._process_batch(batch) - batch = [] - + batch.clear() # More efficient than creating new list self.stats["processed_entries"] += 1 - # Process remaining items - if batch: - self._process_batch(batch) + # Process remaining items + if batch: + self._process_batch(batch) - self.stats["processing_time"] = time.time() - start_time - self.stats["unique_words"] = len(self.word_index) - print( - f"Processed {self.stats['processed_entries']:,} entries in {self.stats['processing_time']:.2f} seconds" - ) - if self.parse_type == "total": - print( - f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25} {'Total Translations':<20}" - ) - print("=" * 90) - - # Print counts for each language - for lang, counts in self.lexical_category_counts.items(): - lang_name = self.iso_to_name[lang] - # Print first row with language name - first_category = True - for category, count in counts.most_common(): - translation_count = self.translation_counts[lang][category] - if first_category: - print( - f"{lang_name:<20} {category:<25} {count:<25,} {translation_count:<20,}" - ) - first_category = False - else: - # Print subsequent rows with blank language column - print( - f"{'':<20} {category:<25} {count:<25,} {translation_count:<20,}" - ) - # Add blank line between languages, but not after the last language - if lang != list(self.lexical_category_counts.keys())[-1]: - print( - f"\n{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25} {'Total Translations':<20}" - ) - print("=" * 90) + # Update stats + self.stats["processing_time"] = time.time() - start_time + self.stats["unique_words"] = len(self.forms_index) + len( + self.translations_index + ) - except FileNotFoundError: - print(f"Error: File not found - {file_path}") - except Exception as e: - print(f"Error processing file: {e}") + # Print summary if "total" was requested + if "total" in self.parse_type: + self._print_total_summary() def _process_batch(self, batch: list) -> None: """ - Process multiple lines at once + Process a batch of lines """ for line in batch: self.process_lines(line) - def export_json(self, filepath: str, language_iso: str = None) -> None: + # MARK: print total summary + def _print_total_summary(self): """ - Save index to file, optionally filtering by language ISO code. 
+ Print stats if parse_type == total + """ + print( + f"{'Language':<20} {'Data Type':<25} {'Total Lexemes':<25} {'Total Translations':<20}" + ) + print("=" * 90) + for lang, counts in self.lexical_category_counts.items(): + lang_name = self.iso_to_name[lang] + first_row = True + for category, count in counts.most_common(): + trans_count = self.translation_counts[lang][category] + if first_row: + print( + f"{lang_name:<20} {category:<25} {count:<25,} {trans_count:<20,}" + ) + first_row = False + else: + print(f"{'':<20} {category:<25} {count:<25,} {trans_count:<20,}") + if lang != list(self.lexical_category_counts.keys())[-1]: + print("\n" + "=" * 90 + "\n") + + # MARK: export translations + def export_translations_json(self, filepath: str, language_iso: str = None) -> None: + """ + Save translations_index to file, optionally filtering by language_iso. """ if language_iso: - # Only proceed if we have a valid ISO code if language_iso not in self.iso_to_name: - print(f"Warning: Unknown ISO code {language_iso}, skipping...") + print( + f"Warning: ISO {language_iso} unknown, skipping translations export..." + ) return + # Filter + filtered = {} + for word, lang_data in self.translations_index.items(): + if language_iso in lang_data: + filtered[word] = {language_iso: lang_data[language_iso]} + + self._save_by_language(filtered, filepath, language_iso, "translations") - # Get full language name - full_language_name = self.iso_to_name[language_iso] + # MARK: export forms + def export_forms_json( + self, filepath: str, language_iso: str = None, data_type: str = None + ) -> None: + """ + Save forms_index to file, optionally filtering by: + - language_iso + - data_type (e.g. "nouns", "adverbs") - # Filter word_index for specific language - filtered_index = {} - for word, lang_data in self.word_index.items(): + If data_type is given, we only export that one category from forms. 
+ """ + if language_iso: + if language_iso not in self.iso_to_name: + print(f"Warning: ISO {language_iso} unknown, skipping forms export...") + return + filtered = {} + for word, lang_data in self.forms_index.items(): if language_iso in lang_data: - filtered_index[word] = {language_iso: lang_data[language_iso]} - - # Create language-specific filepath, removing potential double paths - base_path = Path(filepath) - # Remove language name from base_path if it exists to prevent duplication - if full_language_name in base_path.parts: - parts = [p for p in base_path.parts if p != full_language_name] - base_path = Path(*parts) - - lang_filepath = base_path.parent / full_language_name / base_path.name - lang_filepath.parent.mkdir(parents=True, exist_ok=True) - - print(f"Saving {full_language_name} index to {lang_filepath}...") - with open(lang_filepath, "w", encoding="utf-8") as f: - json.dump(filtered_index, f, indent=2, ensure_ascii=False) - else: - print(f"Saving complete index to {filepath}...") - with open(filepath, "w", encoding="utf-8") as f: - json.dump( - self._convert_defaultdict_to_dict(self.word_index), - f, - indent=2, - ensure_ascii=False, + # If data_type is given, only keep that category + if data_type: + if data_type in lang_data[language_iso]: + filtered[word] = { + language_iso: { + data_type: lang_data[language_iso][data_type] + } + } + else: + filtered[word] = {language_iso: lang_data[language_iso]} + self._save_by_language( + filtered, filepath, language_iso, data_type or "forms" + ) + + def _save_by_language(self, data, filepath, language_iso, category_type): + """ + Save data to exports//filename + """ + base_path = Path(filepath) + lang_name = self.iso_to_name[language_iso] + + lang_filepath = base_path.parent / lang_name / base_path.name + lang_filepath.parent.mkdir(parents=True, exist_ok=True) + + print(f"Saving {lang_name} {category_type} index to {lang_filepath}...") + with open(lang_filepath, "wb") as f: + f.write( + orjson.dumps( + self._to_dict(data), + option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS, ) + ) - def _convert_defaultdict_to_dict(self, dd): + def _to_dict(self, dd): + """ + Recursively convert defaultdict to dict. + """ if isinstance(dd, defaultdict): - dd = {k: self._convert_defaultdict_to_dict(v) for k, v in dd.items()} + dd = {k: self._to_dict(v) for k, v in dd.items()} return dd +# MARK: parse dump def parse_dump( - language: str = None, - parse_type: str = None, - type_output_dir: str = DEFAULT_DUMP_EXPORT_DIR, + language: Union[str, List[str]] = None, + parse_type: List[str] = None, + data_types: List[str] = None, file_path: str = "latest-lexemes.json.bz2", + output_dir: str = None, + overwrite_all: bool = False, ): """ - Process and parse Wikidata lexeme dumps, either analyzing all - or filtering for a specific language. + Parse a Wikidata lexeme dump file and extract linguistic data. Parameters ---------- - language : str, - ISO code of the language to process. If 'all', processes all languages. - parse_type : str - Type of parsing to perform. Options are: - - 'total': Generate statistics about lexeme counts - - 'translations': Create translation indexes - type_output_dir : str - Directory where output files will be saved. Defaults to DEFAULT_DUMP_EXPORT_DIR. - file_path : str - Path to the lexeme dump file. Defaults to 'latest-lexemes.json.bz2'. + language : str or list of str, optional + Language(s) to parse data for. Must match language names in language_metadata. + parse_type : list of str, optional + Types of parsing to perform. 
Valid options are: + - 'translations': Extract word translations + - 'form': Extract grammatical forms + - 'total': Gather statistical totals + data_types : list of str, optional + Categories to parse when using 'form' type (e.g. ["nouns", "adverbs"]). + Only used if 'form' is in parse_type. + file_path : str, default="latest-lexemes.json.bz2" + Path to the lexeme dump file + output_dir : str, optional + Directory to save output files. If None, uses DEFAULT_DUMP_EXPORT_DIR. + overwrite_all : bool, default=False + If True, automatically overwrite existing files without prompting Notes ----- - When parse_type is 'total': - - Total number of lexemes per language along with different lexical categories - - Number of total translations available - - When parse_type is 'translations', it creates JSON index files containing: - - Word-to-translation mappings - - Lexical category information + The function processes a Wikidata lexeme dump and extracts linguistic data based on + the specified parameters. For each language and data type combination, it creates + separate JSON files in the output directory structure: + If a requested index file already exists, that language/category combination + will be skipped. """ - if parse_type == "total": - if language == "all": - print("Processing all lexemes...") - processor = LexemeProcessor(target_iso=None, parse_type=parse_type) - else: - print(f"Processing lexemes for {language}...") - processor = LexemeProcessor(target_iso=language, parse_type=parse_type) - - processor.process_file(file_path) - - else: - # Create the output directory if it doesn't exist - Path(type_output_dir).mkdir(parents=True, exist_ok=True) - - if language: - index_path = Path(type_output_dir) / language / f"lexeme_{parse_type}.json" - if check_index_exists(index_path): - return - else: - index_path = Path(type_output_dir) / f"lexeme_{parse_type}.json" - if check_index_exists(index_path): - return - - print(f"Will save index to: {index_path}") - - processor = LexemeProcessor(target_iso=language, parse_type=parse_type) - - print("Processing the lexeme data file...") - processor.process_file(file_path) - - print(f"Found {len(processor.word_index)} words in total") - - # Get unique ISO codes from the processed data + # 1) Prepare environment - Use default if output_dir is None + output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Convert single strings to lists + languages = [language] if isinstance(language, str) else language + parse_type = parse_type or [] + data_types = data_types or [] + + print(f"Languages: {languages}") + print(f"parse_type: {parse_type}") + if data_types: + print(f"data_types for forms: {data_types}") + + if "total" not in parse_type: + choice = questionary.select( + "Choose an action:", + choices=["Overwrite existing data", "Skip process"], + default="Skip process", + ).ask() + if choice == "Overwrite existing data": + overwrite_all = True + + # For translations, we only need to check the translations index + if "translations" in parse_type: + languages_to_process = [] + for lang in languages: + index_path = Path(output_dir) / lang / "lexeme_translations.json" + if not check_index_exists(index_path, overwrite_all): + languages_to_process.append(lang) + else: + print(f"Skipping {lang}/translations.json - already exists") + + # Update languages list but keep data_types as is + languages = languages_to_process + + # For forms, check each language/data_type combination + elif "form" in parse_type: + 
languages_to_process = [] + data_types_to_process = set() + + for lang in languages: + needs_processing = False + for data_type in data_types: + index_path = Path(output_dir) / lang / f"lexeme_{data_type}.json" + if not check_index_exists(index_path, overwrite_all): + needs_processing = True + data_types_to_process.add(data_type) + else: + print(f"Skipping {lang}/{data_type}.json - already exists") + + if needs_processing: + languages_to_process.append(lang) + + # Update both lists + languages = languages_to_process + data_types = list(data_types_to_process) + + print(f"Languages to process: {languages}") + if data_types: + print(f"Data types to process: {data_types}") + + if not languages: + print("All requested data already exists. Nothing to process.") + return + + processor = LexemeProcessor( + target_iso=languages, parse_type=parse_type, data_types=data_types + ) + processor.process_file(file_path) + + # MARK: Handle JSON exports + + # (a) If "translations" in parse_type -> export them + if "translations" in parse_type: + index_path = Path(output_dir) / "lexeme_translations.json" + + # Export translations for each ISO found iso_codes = set() - for word_data in processor.word_index.values(): + for word_data in processor.translations_index.values(): iso_codes.update(word_data.keys()) - - # Save individual files for each valid language for iso_code in iso_codes: - if iso_code in processor.iso_to_name: # Only process known ISO codes - processor.export_json(str(index_path), iso_code) + if iso_code in processor.iso_to_name: + processor.export_translations_json(str(index_path), iso_code) + + # (b) If "form" in parse_type -> export forms for each data_type in data_types + if "form" in parse_type: + # For each data_type, we create a separate file, e.g. lexeme_nouns.json + for dt in data_types: + index_path = Path(output_dir) / f"lexeme_{dt}.json" + print(f"Exporting forms for {dt} to {index_path}...") + + iso_codes = set() + for word_data in processor.forms_index.values(): + iso_codes.update(word_data.keys()) + + for iso_code in iso_codes: + if iso_code in processor.iso_to_name: + processor.export_forms_json( + filepath=str(index_path), language_iso=iso_code, data_type=dt + ) diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 8cf75090..914fbe9e 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -62,37 +62,37 @@ def test_invalid_arguments(self): # MARK: All Data - @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_data_types_for_language(self, mock_query_data): - """ - Test retrieving all data types for a specific language. - - Ensures that `query_data` is called properly when `--all` flag is used with a language. - """ - get_data(all_bool=True, language="English") - mock_query_data.assert_called_once_with( - languages=["English"], - data_type=None, - output_dir="scribe_data_json_export", - overwrite=False, - ) - - @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_languages_for_data_type(self, mock_query_data): - """ - Test retrieving all languages for a specific data type. - - Ensures that `query_data` is called properly when `--all` flag is used with a data type. 
- """ - get_data(all_bool=True, data_type="nouns") - mock_query_data.assert_called_once_with( - languages=None, - data_type=["nouns"], - output_dir="scribe_data_json_export", - overwrite=False, - ) + # @patch("scribe_data.cli.get.query_data") + # @patch("scribe_data.cli.get.prompt_user_download_all", return_value=False) + # def test_get_all_data_types_for_language(self, mock_prompt, mock_query_data): + # """ + # Test retrieving all data types for a specific language. + + # Ensures that `query_data` is called properly when `--all` flag is used with a language. + # """ + # get_data(all_bool=True, language="English") + # mock_query_data.assert_called_once_with( + # languages=["English"], + # data_type=None, + # output_dir="scribe_data_json_export", + # overwrite=False, + # ) + + # @patch("scribe_data.cli.get.query_data") + # @patch("scribe_data.cli.get.prompt_user_download_all", return_value=False) + # def test_get_all_languages_for_data_type(self, mock_prompt, mock_query_data): + # """ + # Test retrieving all languages for a specific data type. + + # Ensures that `query_data` is called properly when `--all` flag is used with a data type. + # """ + # get_data(all_bool=True, data_type="nouns") + # mock_query_data.assert_called_once_with( + # languages=None, + # data_type=["nouns"], + # output_dir="scribe_data_json_export", + # overwrite=False, + # ) # MARK: Language and Data Type @@ -115,7 +115,8 @@ def test_get_specific_language_and_data_type(self, mock_query_data): # MARK: Capitalized Language @patch("scribe_data.cli.get.query_data") - def test_get_data_with_capitalized_language(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_capitalized_language(self, mock_glob, mock_query_data): """ Test retrieving data with a capitalized language. @@ -133,7 +134,8 @@ def test_get_data_with_capitalized_language(self, mock_query_data): # MARK: Lowercase Language @patch("scribe_data.cli.get.query_data") - def test_get_data_with_lowercase_language(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_lowercase_language(self, mock_glob, mock_query_data): """ Test retrieving data with a lowercase language. @@ -171,7 +173,8 @@ def test_get_data_with_different_output_directory(self, mock_query_data): # MARK: Overwrite is True @patch("scribe_data.cli.get.query_data") - def test_get_data_with_overwrite_true(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_overwrite_true(self, mock_glob, mock_query_data): """ Test retrieving data with the overwrite flag set to True. 
From 155647bc8a7055a9bac9a873175ca4928599b55d Mon Sep 17 00:00:00 2001 From: axif Date: Mon, 30 Dec 2024 01:55:20 +0600 Subject: [PATCH 09/13] Set default language to "all" for translations in get_data function --- src/scribe_data/cli/get.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 118badb0..a7c92afb 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -177,6 +177,8 @@ def prompt_user_download_all(): # MARK: Translations elif data_type == "translations": + if language is None: + language = "all" parse_wd_lexeme_dump( language=language, wikidata_dump_type=["translations"], From 26aa1925c857d8a074526c472a115054da309f3f Mon Sep 17 00:00:00 2001 From: axif Date: Mon, 30 Dec 2024 02:01:31 +0600 Subject: [PATCH 10/13] Removed extra welcome message from interactive mode --- src/scribe_data/cli/interactive.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index 7a0beb2d..f48b952f 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -37,7 +37,6 @@ # from scribe_data.cli.list import list_wrapper from scribe_data.cli.get import get_data from scribe_data.cli.total import total_wrapper -from scribe_data.cli.version import get_local_version from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump from scribe_data.utils import ( DEFAULT_JSON_EXPORT_DIR, @@ -331,7 +330,6 @@ def start_interactive_mode(operation: str = None): operation : str The type of operation that interactive mode is being ran with. """ - rprint(f"[bold cyan]Welcome to {get_local_version()} interactive mode![/bold cyan]") while True: # Check if both selected_languages and selected_data_types are empty. if not config.selected_languages and not config.selected_data_types: From 402493bebd5c6c2d929a167b0597edb006f6f69a Mon Sep 17 00:00:00 2001 From: axif Date: Sat, 4 Jan 2025 00:52:33 +0600 Subject: [PATCH 11/13] Add MediaWiki translation parsing functionality --- src/scribe_data/cli/main.py | 8 +- src/scribe_data/wikidata/wikidata_utils.py | 23 +++ src/scribe_data/wiktionary/parse_mediaWiki.py | 134 ++++++++++++++++++ 3 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 src/scribe_data/wiktionary/parse_mediaWiki.py diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 5a4818f2..8baa7873 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -37,6 +37,8 @@ from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message +from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations + LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." GET_DESCRIPTION = ( "Get data from Wikidata and other sources for the given languages and data types." 
@@ -168,6 +170,9 @@ def main() -> None: type=str, help="Path to a local Wikidata lexemes dump for running with '--all'.", ) + get_parser.add_argument( + "-t", "--translation", type=str, help="parse a single word using MediaWiki API" + ) # MARK: Total @@ -359,7 +364,8 @@ def main() -> None: elif args.command in ["get", "g"]: if args.interactive: start_interactive_mode(operation="get") - + if args.translation: + parse_wiktionary_translations(args.translation) else: get_data( language=args.language.lower() diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 5e29f2e1..14e5fc02 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -24,6 +24,7 @@ from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper from typing import List, Union +import requests from scribe_data.cli.download import wd_lexeme_dump_download_wrapper from scribe_data.wiktionary.parse_dump import parse_dump @@ -34,6 +35,28 @@ sparql.setMethod(POST) +def mediaWiki_query(query: str) -> dict: + """ + Query the Wikidata API using a MediaWiki query. + + Parameters + ---------- + query : str + The MediaWiki query to execute. + + Returns + ------- + dict + The JSON response from the API. + """ + url = ( + f"https://en.wiktionary.org/w/api.php?" + f"action=query&format=json&titles={query}/translations&prop=revisions&rvprop=content" + ) + response = requests.get(url) + return response.json() + + def parse_wd_lexeme_dump( language: Union[str, List[str]] = None, wikidata_dump_type: List[str] = None, diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py new file mode 100644 index 00000000..5faa486a --- /dev/null +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -0,0 +1,134 @@ +""" + Functions to parse the translations of a word from MediaWiki API. + +.. raw:: html + +""" + +import re +import json +from scribe_data.wikidata.wikidata_utils import mediaWiki_query +from scribe_data.utils import get_language_from_iso + + +def fetch_translation_page(word): + data = mediaWiki_query(word) + + pages = data.get("query", {}).get("pages", {}) + # Extract page object from dictionary + page = next(iter(pages.values())) if pages else {} + # Get the wikitext from the 'revisions' key + wikitext = page.get("revisions", [{}])[0].get("*", "") + return wikitext + + +def parse_wikitext_for_translations(wikitext): + """ + Parse the wikitext line by line to extract translations, + language codes, part of speech, and context. 
+ """ + translations_by_lang = {} + current_part_of_speech = None # Track whether we are in Noun or Verb + current_context = None # Track the current trans-top context + + # Split the wikitext into individual lines + for line in wikitext.splitlines(): + # Detect part of speech/data-types: Noun or Verb + if line.startswith("===Noun==="): + current_part_of_speech = "Noun" + elif line.startswith("===Verb==="): + current_part_of_speech = "Verb" + trans_top_match = re.match(r"\{\{trans-top\|(.+?)\}\}", line) + if trans_top_match: + current_context = trans_top_match.group(1).strip() + + template_match = re.match( + r"^\*\s([A-Za-z\s]+):\s\{\{t\+?\|([a-zA-Z\-]+)\|([^|]+)\}\}", line.strip() + ) + if template_match: + lang_code = template_match.group(2).strip() + translation_text = template_match.group(3).strip() + + # Ensure there's a list to hold translations for this language + if lang_code not in translations_by_lang: + translations_by_lang[lang_code] = [] + + translations_by_lang[lang_code].append( + { + "translation": translation_text, + "part_of_speech": current_part_of_speech, + "context": current_context, + } + ) + + return translations_by_lang + + +def build_json_format(word, translations_by_lang): + """ + Build the final JSON format for the translations of a word. + """ + book_translations = {word: {}} + # Keep counters to number the translations for each (lang, part_of_speech) + language_counters = {} + + for lang_code, entries in translations_by_lang.items(): + try: + lang_name = get_language_from_iso(lang_code) + except ValueError: + # Skip this language if it's not supported + continue + + # Make sure this language is in the dictionary + if lang_name not in book_translations[word]: + book_translations[word][lang_name] = {} + + for item in entries: + pos = item["part_of_speech"] or "Unknown" + desc = item["context"] + trans = item["translation"] + + if pos not in book_translations[word][lang_name]: + book_translations[word][lang_name][pos] = {} + language_counters[(lang_code, pos)] = 1 + + idx = str(language_counters[(lang_code, pos)]) + + # Insert the item at the next available index + book_translations[word][lang_name][pos][idx] = { + "description": desc, + "translations": trans, + } + language_counters[(lang_code, pos)] += 1 + + return book_translations + + +def parse_wiktionary_translations(word): + """ + Parse the translations of a word from Wiktionary. 
+ """ + wikitext = fetch_translation_page(word) + translations_by_lang = parse_wikitext_for_translations(wikitext) + + if not translations_by_lang: + print("No translations found") + return + + final_json = build_json_format(word, translations_by_lang) + print(json.dumps(final_json, indent=4, ensure_ascii=False)) From 94b060a33ecfa184ea49b8b1e190d9892bf439e3 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 4 Jan 2025 14:00:07 +0100 Subject: [PATCH 12/13] Formatting for all doc strings, spacing and minor improvements --- CONTRIBUTING.md | 13 +- .../check/check_project_metadata.py | 26 +- .../check/check_project_structure.py | 22 +- src/scribe_data/check/check_pyicu.py | 29 ++- src/scribe_data/check/check_query_forms.py | 98 ++++---- .../check/check_query_identifiers.py | 56 ++--- src/scribe_data/cli/cli_utils.py | 34 +-- src/scribe_data/cli/convert.py | 92 +++---- src/scribe_data/cli/download.py | 57 +++-- src/scribe_data/cli/get.py | 48 ++-- src/scribe_data/cli/interactive.py | 45 ++-- src/scribe_data/cli/list.py | 20 +- src/scribe_data/cli/main.py | 18 +- src/scribe_data/cli/total.py | 61 +++-- .../unicode/generate_emoji_keywords.py | 13 +- src/scribe_data/unicode/process_unicode.py | 10 +- src/scribe_data/utils.py | 238 +++++++++--------- src/scribe_data/wikidata/check_query/check.py | 65 ++--- .../wikidata/check_query/sparql.py | 26 +- src/scribe_data/wikidata/format_data.py | 14 +- src/scribe_data/wikidata/query_data.py | 32 +-- src/scribe_data/wikidata/wikidata_utils.py | 17 +- src/scribe_data/wikipedia/extract_wiki.py | 86 +++---- src/scribe_data/wikipedia/process_wiki.py | 50 ++-- src/scribe_data/wiktionary/parse_dump.py | 154 +++++++----- src/scribe_data/wiktionary/parse_mediaWiki.py | 54 ++-- tests/cli/test_download.py | 10 +- tests/wikidata/test_check_query.py | 1 + 28 files changed, 725 insertions(+), 664 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 924927ec..f929c112 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -300,13 +300,18 @@ def example_function(argument: argument_type) -> return_type: Parameters ---------- - argument: argument_type - Description of your argument. + argument: argument_type + Description of your argument. Returns ------- - return_value : return_type - Description of your return value. + return_value : return_type + Description of your return value. + + Raises + ------ + ErrorType + Description of the error and the condition that raises it. """ ... diff --git a/src/scribe_data/check/check_project_metadata.py b/src/scribe_data/check/check_project_metadata.py index 84523ba2..159d8ca2 100644 --- a/src/scribe_data/check/check_project_metadata.py +++ b/src/scribe_data/check/check_project_metadata.py @@ -88,16 +88,16 @@ def get_missing_languages( Parameters ---------- - reference_languages : dict - A dictionary of languages from the reference source. + reference_languages : dict + A dictionary of languages from the reference source. - target_languages : dict - A dictionary of languages from the target source to check for missing entries. + target_languages : dict + A dictionary of languages from the target source to check for missing entries. Returns ------- - list[str] - A list of languages and sub-languages that are in target_languages but not in reference_languages. + list[str] + A list of languages and sub-languages that are in target_languages but not in reference_languages. 
""" missing_languages = [] reference_keys = reference_languages.keys() @@ -130,17 +130,17 @@ def validate_language_properties(languages_dict: dict) -> dict: Parameters ---------- - languages_dict : dict - A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key. + languages_dict : dict + A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key. Returns ------- - dict: A dictionary with two lists: - - "missing_qids": Languages or sub-languages missing the 'qid' property. - - "missing_isos": Languages or sub-languages missing the 'iso' property. + dict: A dictionary with two lists: + - "missing_qids": Languages or sub-languages missing the 'qid' property. + - "missing_isos": Languages or sub-languages missing the 'iso' property. - Each entry in these lists is in the format "parent_language - sub_language" for sub-languages, - or simply "parent_language" for the parent languages. + Each entry in these lists is in the format "parent_language - sub_language" for sub-languages, + or simply "parent_language" for the parent languages. """ missing_qids = [] missing_isos = [] diff --git a/src/scribe_data/check/check_project_structure.py b/src/scribe_data/check/check_project_structure.py index 612299df..edfe9085 100644 --- a/src/scribe_data/check/check_project_structure.py +++ b/src/scribe_data/check/check_project_structure.py @@ -48,24 +48,24 @@ def check_for_sparql_files(folder_path, data_type, language, subdir, missing_que Parameters ---------- - folder_path : str - The path to the data-type folder. + folder_path : str + The path to the data-type folder. - data_type : str - The name of the data type being checked. + data_type : str + The name of the data type being checked. - language : str - The name of the language being processed. + language : str + The name of the language being processed. - subdir : str or None - The name of the sub-directory (for languages with sub-dialects), or None. + subdir : str or None + The name of the sub-directory (for languages with sub-dialects), or None. - missing_queries : list - A list to which missing SPARQL query files will be appended. + missing_queries : list + A list to which missing SPARQL query files will be appended. Returns ------- - bool: True if at least one .sparql file is found, False otherwise. + bool: True if at least one .sparql file is found, False otherwise. """ sparql_files = [f for f in os.listdir(folder_path) if f.endswith(".sparql")] diff --git a/src/scribe_data/check/check_pyicu.py b/src/scribe_data/check/check_pyicu.py index a2d645ce..456d4ed8 100644 --- a/src/scribe_data/check/check_pyicu.py +++ b/src/scribe_data/check/check_pyicu.py @@ -27,8 +27,8 @@ from pathlib import Path import pkg_resources +import questionary import requests -from questionary import confirm def check_if_pyicu_installed(): @@ -90,15 +90,15 @@ def download_wheel_file(wheel_url, output_dir): Parameters ---------- - wheel_url : str - The URL of the wheel file to download. + wheel_url : str + The URL of the wheel file to download. - output_dir : str - The directory to save the downloaded file. + output_dir : str + The directory to save the downloaded file. Returns ------- - str : path to the downloaded wheel file. + str : path to the downloaded wheel file. 
""" response = requests.get(wheel_url) response.raise_for_status() # raise an error for bad responses @@ -118,18 +118,18 @@ def find_matching_wheel(wheels, python_version, architecture): Parameters ---------- - wheels : list - The list of available wheels. + wheels : list + The list of available wheels. - python_version : str - The Python version (e.g., 'cp311'). + python_version : str + The Python version (e.g., 'cp311'). - architecture : str - The architecture type (e.g., 'win_amd64'). + architecture : str + The architecture type (e.g., 'win_amd64'). Returns ------- - str : The download URL of the matching wheel or None if not found. + str : The download URL of the matching wheel or None if not found. """ return next( ( @@ -148,8 +148,7 @@ def check_and_install_pyicu(): # Fetch available wheels from GitHub to estimate download size. wheels, total_size_mb = fetch_wheel_releases() - # Use questionary to ask for user confirmation - user_wants_to_proceed = confirm( + user_wants_to_proceed = questionary.confirm( f"{package_name} is not installed.\nScribe-Data can install the package and the needed dependencies." f"\nApproximately {total_size_mb:.2f} MB will be downloaded.\nDo you want to proceed?" ).ask() diff --git a/src/scribe_data/check/check_query_forms.py b/src/scribe_data/check/check_query_forms.py index 12c4d96d..5435c844 100644 --- a/src/scribe_data/check/check_query_forms.py +++ b/src/scribe_data/check/check_query_forms.py @@ -57,18 +57,18 @@ def extract_forms_from_sparql(file_path: Path) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract forms. + file_path : Path + The path to the SPARQL query file from which to extract forms. Returns ------- - query_form_dict : dict - The file path with form labels of the query and their respective QIDs. + query_form_dict : dict + The file path with form labels of the query and their respective QIDs. Raises ------ - FileNotFoundError - If the specified file does not exist. + FileNotFoundError + If the specified file does not exist. """ optional_pattern = r"\s\sOPTIONAL\s*\{([^}]*)\}" try: @@ -95,13 +95,13 @@ def extract_form_rep_label(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - str - The label of the form representation. + str + The label of the form representation. """ onto_rep_pattern = r"ontolex:representation .* ;" if line_match := re.search(pattern=onto_rep_pattern, string=form_text): @@ -119,13 +119,13 @@ def decompose_label_features(label): Parameters ---------- - label : str - The concatenated label string composed of several grammatical features. + label : str + The concatenated label string composed of several grammatical features. Returns ------- - list - A list of grammatical features extracted from the label in their original order. + list + A list of grammatical features extracted from the label in their original order. """ components = re.findall(r"[A-Za-z][^A-Z]*", label) valid_components = [] @@ -157,13 +157,13 @@ def extract_form_qids(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - list[str] - All QIDS that make up the form. + list[str] + All QIDS that make up the form. """ qids_pattern = r"wikibase:grammaticalFeature .+ \." 
if match := re.search(pattern=qids_pattern, string=form_text): @@ -179,13 +179,13 @@ def check_form_label(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - bool - Whether the form and its current representation label match (repForm and rep). + bool + Whether the form and its current representation label match (repForm and rep). """ form_label_line_pattern = r"\?lexeme ontolex:lexicalForm .* \." @@ -221,13 +221,13 @@ def check_query_formatting(form_text: str): Parameters ---------- - query_text : str - The SPARQL query text to check. + query_text : str + The SPARQL query text to check. Returns ------- - bool - Whether there are formatting errors with the query. + bool + Whether there are formatting errors with the query. """ # Check for spaces before commas that should not exist. if re.search(r"\s,", form_text): @@ -249,13 +249,13 @@ def return_correct_form_label(qids: list): Parameters ---------- - qids : list[str] - All QIDS that make up the form. + qids : list[str] + All QIDS that make up the form. Returns ------- - correct_label : str - The label for the representation given the QIDs. + correct_label : str + The label for the representation given the QIDs. """ if not qids: return "Invalid query formatting found" @@ -289,14 +289,14 @@ def validate_forms(query_text: str) -> str: Parameters ---------- - query_file : str - The SPARQL query text as a string. + query_file : str + The SPARQL query text as a string. Returns ------- - str - Error message if there are any issues with the order of variables or forms, - otherwise an empty string. + str + Error message if there are any issues with the order of variables or forms, + otherwise an empty string. """ select_pattern = r"SELECT\s+(.*?)\s+WHERE" @@ -376,13 +376,13 @@ def check_docstring(query_text: str) -> bool: Parameters ---------- - query_text : str - The SPARQL query's text to be checked. + query_text : str + The SPARQL query's text to be checked. Returns ------- - bool - True if the docstring is correctly formatted. + bool + True if the docstring is correctly formatted. """ # Split the text into lines. query_lines = query_text.splitlines(keepends=True) @@ -418,14 +418,14 @@ def check_forms_order(query_text): Parameters ---------- - query_text : str - The SPARQL query text containing the SELECT statement with variables. + query_text : str + The SPARQL query text containing the SELECT statement with variables. Returns ------- - list or bool - A sorted list of variables if the ordering differs from the original, - otherwise a boolean indicating that the order matches. + list or bool + A sorted list of variables if the ordering differs from the original, + otherwise a boolean indicating that the order matches. """ select_pattern = r"SELECT\s+(.*?)\s+WHERE" @@ -496,14 +496,14 @@ def check_optional_qid_order(query_file: str) -> str: Parameters ---------- - query_file : str - The path to the SPARQL query file to be checked. + query_file : str + The path to the SPARQL query file to be checked. Returns ------- - str - A formatted string with details on any order mismatches in the QIDs, or an empty - string if all QIDs are correctly ordered. + str + A formatted string with details on any order mismatches in the QIDs, or an empty + string if all QIDs are correctly ordered. 
""" forms = extract_forms_from_sparql(query_file) error_messages = [] diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index 3d1779e7..5337b972 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -41,21 +41,21 @@ def is_valid_language(query_file: Path, lang_qid: str) -> bool: Parameters ---------- - query_file : Path - The path to the SPARQL query file being validated. + query_file : Path + The path to the SPARQL query file being validated. - lang_qid : str - The QID of the language extracted from the SPARQL query. + lang_qid : str + The QID of the language extracted from the SPARQL query. Returns ------- - bool - True if the language QID is valid, otherwise False. + bool + True if the language QID is valid, otherwise False. - Example - ------- - > is_valid_language(Path("path/to/query.sparql"), "Q123456") - True + Examples + -------- + > is_valid_language(Path("path/to/query.sparql"), "Q123456") + True """ lang_directory_name = query_file.parent.parent.name.lower() language_entry = language_metadata.get(lang_directory_name) @@ -79,21 +79,21 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: Parameters ---------- - query_file : Path - The path to the SPARQL query file being validated. + query_file : Path + The path to the SPARQL query file being validated. - data_type_qid : str - The QID of the data type extracted from the SPARQL query. + data_type_qid : str + The QID of the data type extracted from the SPARQL query. Returns ------- - bool - True if the data type QID is valid, otherwise False. + bool + True if the data type QID is valid, otherwise False. - Example - ------- - > is_valid_data_type(Path("path/to/query.sparql"), "Q654321") - True + Examples + -------- + > is_valid_data_type(Path("path/to/query.sparql"), "Q654321") + True """ directory_name = query_file.parent.name # e.g., "nouns" or "verbs" expected_data_type_qid = data_type_metadata.get(directory_name) @@ -107,21 +107,21 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract the QID. + file_path : Path + The path to the SPARQL query file from which to extract the QID. - pattern : str - The regex pattern used to match the QID (either for language or data type). + pattern : str + The regex pattern used to match the QID (either for language or data type). Returns ------- - str - The extracted QID if found, otherwise None. + str + The extracted QID if found, otherwise None. Raises ------ - FileNotFoundError - If the specified file does not exist. + FileNotFoundError + If the specified file does not exist. """ try: with open(file_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index 5c8cd14b..6fa46651 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -34,12 +34,12 @@ def correct_data_type(data_type: str) -> str: Parameters ---------- - data_type : str - The data type to potentially correct. + data_type : str + The data type to potentially correct. Returns ------- - The data_type value or a corrected version of it. + The data_type value or a corrected version of it. """ all_data_types = data_type_metadata.keys() @@ -122,16 +122,16 @@ def validate_language_and_data_type( Parameters ---------- - language : str or list - The language(s) to validate. 
+ language : str or list + The language(s) to validate. - data_type : str or list - The data type(s) to validate. + data_type : str or list + The data type(s) to validate. Raises ------ - ValueError - If any of the languages or data types is invalid, with all errors reported together. + ValueError + If any of the languages or data types is invalid, with all errors reported together. """ def validate_single_item(item, valid_options, item_type): @@ -140,19 +140,19 @@ def validate_single_item(item, valid_options, item_type): Parameters ---------- - item : str - The item to validate. - valid_options : list + item : str + The item to validate. - A list of valid options against which the item will be validated. + valid_options : list + A list of valid options against which the item will be validated. - item_type : str - A description of the item type (e.g., "language", "data-type") used in error messages. + item_type : str + A description of the item type (e.g., "language", "data-type") used in error messages. Returns ------- - str or None - Returns an error message if the item is invalid, or None if the item is valid. + str or None + Returns an error message if the item is invalid, or None if the item is valid. """ if ( isinstance(item, str) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index a2e2f777..27518244 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -51,30 +51,30 @@ def convert_to_json( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : Union[str, List[str]] - The data type of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. - output_type : str - The output format, should be "json". + output_type : str + The output format, should be "json". - input_file : str - The input CSV/TSV file path. + input_file : str + The input CSV/TSV file path. - output_dir : Path - The output directory path for results. + output_dir : Path + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". Returns ------- - None + None """ if not language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -205,30 +205,30 @@ def convert_to_csv_or_tsv( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : Union[str, List[str]] - The data type of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. - output_type : str - The output format, should be "csv" or "tsv". + output_type : str + The output format, should be "csv" or "tsv". - input_file : str - The input JSON file path. + input_file : str + The input JSON file path. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". 
Returns ------- - None + None """ if not language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -391,30 +391,30 @@ def convert_to_sqlite( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : str - The data type of the file to convert. + data_type : str + The data type of the file to convert. - output_type : str - The output format, should be "sqlite". + output_type : str + The output format, should be "sqlite". - input_file : Path - The input file path for the data to be converted. + input_file : Path + The input file path for the data to be converted. - output_dir : Path - The output directory path for results. + output_dir : Path + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". Returns ------- - A SQLite file saved in the given location. + A SQLite file saved in the given location. """ if input_file: input_file = Path(input_file) @@ -487,7 +487,7 @@ def convert_wrapper( Returns ------- - None + None """ output_type = output_type.lower() diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py index 4ce478e0..f7f29adf 100644 --- a/src/scribe_data/cli/download.py +++ b/src/scribe_data/cli/download.py @@ -27,10 +27,10 @@ from pathlib import Path from typing import Optional +import questionary import requests from rich import print as rprint from tqdm import tqdm -import questionary from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download @@ -46,13 +46,16 @@ def parse_date(date_string): Parameters ---------- - date_string : str - The date string to be parsed. + date_string : str + The date string to be parsed. Returns ------- - datetime.date : Parsed date object if the format is valid. - None : If the date format is invalid. + datetime.date + Parsed date object if the format is valid. + + None + If the date format is invalid. """ formats = ["%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"] for fmt in formats: @@ -76,19 +79,22 @@ def available_closest_lexeme_dumpfile( Parameters ---------- - target_entity : str - The target date for which the dump is requested (format: YYYY/MM/DD or similar). + target_entity : str + The target date for which the dump is requested (format: YYYY/MM/DD or similar). - other_old_dumps : list - List of available dump folders as strings. + other_old_dumps : list + List of available dump folders as strings. - check_wd_dump_exists : function - A function to validate if the dump file exists. + check_wd_dump_exists : function + A function to validate if the dump file exists. Returns ------- - str : The closest available dump file date (as a string). - None : If no suitable dump is found. + str + The closest available dump file date (as a string). + + None + If no suitable dump is found. """ target_date = parse_date(target_entity) closest_date = None @@ -122,16 +128,19 @@ def download_wd_lexeme_dump(target_entity: str = "latest-lexemes"): Parameters ---------- - target_entity : str, optional - The target dump to download. Defaults to "latest-lexemes". + target_entity : str, optional + The target dump to download. Defaults to "latest-lexemes". - - If "latest-lexemes", downloads the latest dump. 
- - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date. + - If "latest-lexemes", downloads the latest dump. + - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date. Returns ------- - str : The URL of the requested or closest available dump. - None : If no suitable dump is found or the request fails. + str + The URL of the requested or closest available dump. + + None + If no suitable dump is found or the request fails. """ base_url = "https://dumps.wikimedia.org/wikidatawiki/entities" @@ -219,12 +228,12 @@ def wd_lexeme_dump_download_wrapper( Parameters ---------- - wikidata_dump : str - Optional date string in YYYYMMDD format for specific dumps. + wikidata_dump : str + Optional date string in YYYYMMDD format for specific dumps. - output_dir : str - Optional directory path for the downloaded file. - Defaults to 'scribe_data_wikidata_dumps_export' directory. + output_dir : str + Optional directory path for the downloaded file. + Defaults to 'scribe_data_wikidata_dumps_export' directory. """ dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes") diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index a7c92afb..3e2f3897 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -24,8 +24,8 @@ from pathlib import Path from typing import List, Union -from rich import print as rprint import questionary +from rich import print as rprint from scribe_data.cli.convert import convert_wrapper from scribe_data.unicode.generate_emoji_keywords import generate_emoji @@ -56,39 +56,39 @@ def get_data( Parameters ---------- - language : str - The language(s) to get. + language : str + The language(s) to get. - data_type : str - The data type(s) to get. + data_type : str + The data type(s) to get. - output_type : str - The output file type. + output_type : str + The output file type. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - outputs_per_entry : str - How many outputs should be generated per data entry. + outputs_per_entry : str + How many outputs should be generated per data entry. - overwrite : bool (default: False) - Whether to overwrite existing files. + overwrite : bool (default: False) + Whether to overwrite existing files. - all_bool : bool - Get all languages and data types. + all_bool : bool + Get all languages and data types. - interactive : bool (default: False) - Whether it's running in interactive mode. + interactive : bool (default: False) + Whether it's running in interactive mode. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". - wikidata_dump : str - The local Wikidata dump that can be used to process data. + wikidata_dump : str + The local Wikidata lexeme dump that can be used to process data. Returns ------- - The requested data saved locally given file type and location arguments. + The requested data saved locally given file type and location arguments. 
""" # MARK: Defaults @@ -159,7 +159,7 @@ def prompt_user_download_all(): else: print("Updating all languages and data types...") rprint( - "[bold red]Note that the download all functionality must use Wikidata dumps to observe responsible Wikidata Query Service usage practices.[/bold red]" + "[bold red]Note that the download all functionality must use Wikidata lexeme dumps to observe responsible Wikidata Query Service usage practices.[/bold red]" ) parse_wd_lexeme_dump( language="all", @@ -187,7 +187,7 @@ def prompt_user_download_all(): ) return - # MARK: Query Data using Wikidata Dump + # MARK: Form Dump elif wikidata_dump: parse_wd_lexeme_dump( diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index f48b952f..5e5dec74 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -27,7 +27,6 @@ import questionary from prompt_toolkit import prompt from prompt_toolkit.completion import WordCompleter -from questionary import Choice from rich import print as rprint from rich.console import Console from rich.logging import RichHandler @@ -37,14 +36,14 @@ # from scribe_data.cli.list import list_wrapper from scribe_data.cli.get import get_data from scribe_data.cli.total import total_wrapper -from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, DEFAULT_DUMP_EXPORT_DIR, + DEFAULT_JSON_EXPORT_DIR, data_type_metadata, language_metadata, list_all_languages, ) +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump # MARK: Config Setup @@ -261,10 +260,12 @@ def request_total_lexeme_loop(): choice = questionary.select( "What would you like to do?", choices=[ - Choice("Configure total lexemes request", "total"), - Choice("Run total lexemes request", "run"), - Choice("Run total lexemes request with lexeme dumps", "run_all"), - Choice("Exit", "exit"), + questionary.Choice("Configure total lexemes request", "total"), + questionary.Choice("Run total lexemes request", "run"), + questionary.Choice( + "Run total lexemes request with lexeme dumps", "run_all" + ), + questionary.Choice("Exit", "exit"), ], ).ask() @@ -303,7 +304,7 @@ def request_total_lexeme_loop(): # See list of languages. # """ -# choice = questionary.select( +# choice = select( # "What would you like to list?", # choices=[ # Choice("All languages", "all_languages"), @@ -327,42 +328,46 @@ def start_interactive_mode(operation: str = None): Parameters ---------- - operation : str - The type of operation that interactive mode is being ran with. + operation : str + The type of operation that interactive mode is being ran with. """ while True: # Check if both selected_languages and selected_data_types are empty. 
if not config.selected_languages and not config.selected_data_types: if operation == "get": choices = [ - Choice("Configure get data request", "configure"), + questionary.Choice("Configure get data request", "configure"), # Choice("See list of languages", "languages"), - Choice("Exit", "exit"), + questionary.Choice("Exit", "exit"), ] elif operation == "total": choices = [ - Choice("Configure total lexemes request", "total"), + questionary.Choice("Configure total lexemes request", "total"), # Choice("See list of languages", "languages"), - Choice("Exit", "exit"), + questionary.Choice("Exit", "exit"), ] elif operation == "translations": choices = [ - Choice("Configure translations request", "translations"), + questionary.Choice( + "Configure translations request", "translations" + ), # Choice("See list of languages", "languages"), - Choice("Exit", "exit"), + questionary.Choice("Exit", "exit"), ] else: choices = [ - Choice("Configure get data request", "configure"), - Choice("Exit", "exit"), + questionary.Choice("Configure get data request", "configure"), + questionary.Choice("Exit", "exit"), ] if config.configured: - choices.insert(1, Choice("Request for get data", "run")) + choices.insert(1, questionary.Choice("Request for get data", "run")) else: - choices.insert(1, Choice("Request for total lexeme", "total")) + choices.insert( + 1, questionary.Choice("Request for total lexeme", "total") + ) choice = questionary.select("What would you like to do?", choices=choices).ask() diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index a2aebfce..72175879 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -70,8 +70,8 @@ def list_data_types(language: str = None) -> None: Parameters ---------- - language : str - The language to potentially list data types for. + language : str + The language to potentially list data types for. """ languages = list_all_languages(language_metadata) if language: @@ -142,8 +142,8 @@ def list_languages_for_data_type(data_type: str) -> None: Parameters ---------- - data_type : str - The data type to check for. + data_type : str + The data type to check for. """ data_type = correct_data_type(data_type=data_type) all_languages = list_languages_with_metadata_for_data_type(language_metadata) @@ -179,14 +179,14 @@ def list_wrapper( Parameters ---------- - language : str - The language to potentially list data types for. + language : str + The language to potentially list data types for. - data_type : str - The data type to check for. + data_type : str + The data type to check for. - all_bool : boolean - Whether all languages and data types should be listed. + all_bool : boolean + Whether all languages and data types should be listed. 
""" if (not language and not data_type) or all_bool: list_all() diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 8baa7873..e22f4aea 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -24,8 +24,8 @@ import argparse from pathlib import Path -from rich import print as rprint from questionary import select +from rich import print as rprint from scribe_data.cli.cli_utils import validate_language_and_data_type from scribe_data.cli.convert import convert_wrapper @@ -36,7 +36,6 @@ from scribe_data.cli.total import total_wrapper from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message - from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." @@ -291,8 +290,8 @@ def main() -> None: download_parser = subparsers.add_parser( "download", aliases=["d"], - help="Download Wikidata dumps.", - description="Download Wikidata dumps from dumps.wikimedia.org.", + help="Download Wikidata lexeme dumps.", + description="Download Wikidata lexeme dumps from dumps.wikimedia.org.", epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) @@ -302,7 +301,7 @@ def main() -> None: "--wikidata-dump-version", nargs="?", const="latest", - help="Download Wikidata dump. Optionally specify date in YYYYMMDD format.", + help="Download Wikidata lexeme dump. Optionally specify date in YYYYMMDD format.", ) download_parser.add_argument( "-wdp", @@ -428,7 +427,7 @@ def main() -> None: action = select( "What would you like to do?", choices=[ - "Download a Wikidata dump", + "Download a Wikidata lexemes dump", "Check for totals", "Get data", "Get translations", @@ -436,16 +435,21 @@ def main() -> None: ], ).ask() - if action == "Download a Wikidata dump": + if action == "Download a Wikidata lexemes dump": wd_lexeme_dump_download_wrapper() + elif action == "Check for totals": start_interactive_mode(operation="total") + elif action == "Get data": start_interactive_mode(operation="get") + elif action == "Get translations": start_interactive_mode(operation="translations") + else: print("Skipping action") + else: parser.print_help() diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index e543256e..8d86d7fe 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -35,8 +35,7 @@ language_to_qid, list_all_languages, ) -from scribe_data.wikidata.wikidata_utils import sparql -from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump, sparql def get_qid_by_input(input_str): @@ -45,13 +44,13 @@ def get_qid_by_input(input_str): Parameters ---------- - input_str : str - The input string representing a language or data type. + input_str : str + The input string representing a language or data type. Returns ------- - str or None - The QID corresponding to the input string, or- None if not found. + str or None + The QID corresponding to the input string, or- None if not found. """ if input_str: if input_str in language_to_qid: @@ -69,13 +68,13 @@ def get_datatype_list(language): Parameters ---------- - language : str - The language to return data types for. + language : str + The language to return data types for. Returns ------- - data_types : list[str] or None - A list of the corresponding data types. 
+ data_types : list[str] or None + A list of the corresponding data types. """ language_key = language.strip().lower() # normalize input languages = list_all_languages(language_metadata) @@ -129,18 +128,18 @@ def check_qid_is_language(qid: str): """ Parameters ---------- - qid : str - The QID to check Wikidata to see if it's a language and return its English label. + qid : str + The QID to check Wikidata to see if it's a language and return its English label. Outputs ------- - str - The English label of the Wikidata language entity. + str + The English label of the Wikidata language entity. Raises ------ - ValueError - An invalid QID that's not a language has been passed. + ValueError + An invalid QID that's not a language has been passed. """ api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0" request_string = f"{api_endpoint}/entities/items/{qid}" @@ -167,13 +166,13 @@ def print_total_lexemes(language: str = None): Parameters ---------- - language : str (Default=None) - The language to display data type entity counts for. + language : str (Default=None) + The language to display data type entity counts for. Outputs ------- - str - A formatted string indicating the language, data type, and total number of lexemes for all the languages, if found. + str + A formatted string indicating the language, data type, and total number of lexemes for all the languages, if found. """ if language is None: print("Returning total counts for all languages and data types...\n") @@ -379,24 +378,24 @@ def total_wrapper( Parameters ---------- - language : Union[str, List[str]] - The language(s) to potentially total data types for. + language : Union[str, List[str]] + The language(s) to potentially total data types for. - data_type : Union[str, List[str]] - The data type(s) to check for. + data_type : Union[str, List[str]] + The data type(s) to check for. - all_bool : boolean - Whether all languages and data types should be listed. + all_bool : boolean + Whether all languages and data types should be listed. - wikidata_dump : Union[str, bool] - The local Wikidata dump path that can be used to process data. - If True, indicates the flag was used without a path. + wikidata_dump : Union[str, bool] + The local Wikidata lexeme dump path that can be used to process data. + If True, indicates the flag was used without a path. """ # Handle --all flag if all_bool and wikidata_dump: language = "all" - if wikidata_dump is True: # flag without a wikidata dump path + if wikidata_dump is True: # flag without a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, wikidata_dump_type=["total"], @@ -404,7 +403,7 @@ def total_wrapper( ) return - if isinstance(wikidata_dump, str): # if user provided a wikidata dump path + if isinstance(wikidata_dump, str): # if user provided a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, wikidata_dump_type=["total"], diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index 2661f48d..1d33b158 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -44,16 +44,17 @@ def generate_emoji(language, output_dir: str = None): Parameters ---------- - language : str - The ISO code of the language for which to generate emoji keywords. + language : str + The ISO code of the language for which to generate emoji keywords. - output_dir : str, optional - The directory where the generated data will be saved. 
- If not specified, the data will be saved in a default directory. + output_dir : str, optional + The directory where the generated data will be saved. + If not specified, the data will be saved in a default directory. Returns ------- - None: The function does not return any value but outputs data to the specified directory. + None + The function does not return any value but outputs data to the specified directory. """ if check_and_install_pyicu() and check_if_pyicu_installed() is False: print("Thank you.") diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index abdf2363..eb373862 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -57,15 +57,15 @@ def gen_emoji_lexicon( Parameters ---------- - language : string (default=None) - The language keywords are being generated for. + language : string (default=None) + The language keywords are being generated for. - emojis_per_keyword : int (default=None) - The limit for number of emoji keywords that should be generated per keyword. + emojis_per_keyword : int (default=None) + The limit for number of emoji keywords that should be generated per keyword. Returns ------- - Keywords dictionary for emoji keywords-to-unicode are saved locally or uploaded to Scribe apps. + Keywords dictionary for emoji keywords-to-unicode are saved locally or uploaded to Scribe apps. """ if not icu_installed: raise ImportError("Could not import required PyICU functionality.") diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 163af4ae..311478bc 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -22,17 +22,17 @@ """ import ast +import contextlib import json import os import re -import questionary from datetime import datetime from importlib import resources from pathlib import Path from typing import Any, Optional +import questionary from rich import print as rprint -from questionary import select # MARK: Utils Variables @@ -86,7 +86,6 @@ for lang, lang_data in language_metadata.items(): if "sub_languages" in lang_data: for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): - sub_lang_lower = sub_lang sub_qid = sub_lang_data.get("qid") if sub_qid is None: @@ -95,8 +94,8 @@ ) else: - language_map[sub_lang_lower] = sub_lang_data - language_to_qid[sub_lang_lower] = sub_qid + language_map[sub_lang] = sub_lang_data + language_to_qid[sub_lang] = sub_qid else: qid = lang_data.get("qid") @@ -114,15 +113,15 @@ def _load_json(package_path: str, file_name: str) -> Any: Parameters ---------- - package_path : str - The fully qualified package that contains the resource. + package_path : str + The fully qualified package that contains the resource. - file_name : str - The name of the file (resource) that contains the JSON data. + file_name : str + The name of the file (resource) that contains the JSON data. Returns ------- - A python entity representing the JSON content. + A python entity representing the JSON content. """ with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" @@ -143,25 +142,26 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - Parameters ---------- - source_value : str - The source value to find equivalents for (e.g., 'english', 'nynorsk'). + source_value : str + The source value to find equivalents for (e.g., 'english', 'nynorsk'). - source_key : str - The source key to reference (e.g., 'language'). 
+ source_key : str + The source key to reference (e.g., 'language'). - target_key : str - The key to target (e.g., 'qid'). + target_key : str + The key to target (e.g., 'qid'). - error_msg : str - The message displayed when a value cannot be found. + error_msg : str + The message displayed when a value cannot be found. Returns ------- - The 'target' value given the passed arguments. + The 'target' value given the passed arguments. Raises ------ - ValueError : when a source_value is not supported or the language only has sub-languages. + ValueError + When a source_value is not supported or the language only has sub-languages. """ # Check if we're searching by language name. if source_key == "language": @@ -195,13 +195,13 @@ def get_language_qid(language: str) -> str: Parameters ---------- - language : str - The language the QID should be returned for. + language : str + The language the QID should be returned for. Returns ------- - str - The Wikidata QID for the language. + str + The Wikidata QID for the language. """ return _find( source_key="language", @@ -217,13 +217,13 @@ def get_language_iso(language: str) -> str: Parameters ---------- - language : str - The language the ISO should be returned for. + language : str + The language the ISO should be returned for. Returns ------- - str - The ISO code for the language. + str + The ISO code for the language. """ return _find( @@ -240,13 +240,13 @@ def get_language_from_iso(iso: str) -> str: Parameters ---------- - iso : str - The ISO the language name should be returned for. + iso : str + The ISO the language name should be returned for. Returns ------- - str - The name for the language which has an ISO value of iso. + str + The name for the language which has an ISO value of iso. """ # Iterate over the languages and their properties. for language, properties in _languages.items(): @@ -272,19 +272,19 @@ def load_queried_data( Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - tuple(Any, str) - A tuple containing the loaded data and the path to the data file. + tuple(Any, str) + A tuple containing the loaded data and the path to the data file. """ data_path = ( Path(dir_path) / language.lower().replace(" ", "_") / f"{data_type}.json" @@ -300,18 +300,18 @@ def remove_queried_data(dir_path: str, language: str, data_type: str) -> None: Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - None : The file is deleted. + None : The file is deleted. 
""" data_path = ( Path(dir_path) @@ -319,12 +319,9 @@ def remove_queried_data(dir_path: str, language: str, data_type: str) -> None: / f"{data_type}_queried.json" ) - try: + with contextlib.suppress(OSError): os.remove(data_path) - except OSError: - pass - def export_formatted_data( dir_path: str, @@ -338,21 +335,21 @@ def export_formatted_data( Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - formatted_data : dict - The data to be exported. + formatted_data : dict + The data to be exported. - language : str - The language for which the data is being exported. + language : str + The language for which the data is being exported. - data_type : str - The type of data being exported (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being exported (e.g. 'nouns', 'verbs'). Returns ------- - None + None """ export_path = ( Path(dir_path) @@ -375,13 +372,13 @@ def get_ios_data_path(language: str) -> str: Parameters ---------- - language : str - The language the path should be returned for. + language : str + The language the path should be returned for. Returns ------- - str - The path to the language folder for the given language. + str + The path to the language folder for the given language. """ return Path("Scribe-iOS") / "Keyboards" / "LanguageKeyboards" / f"{language}" @@ -392,13 +389,13 @@ def get_android_data_path() -> str: Parameters ---------- - language : str - The language the path should be returned for. + language : str + The language the path should be returned for. Returns ------- - str - The path to the assets data folder for the application. + str + The path to the assets data folder for the application. """ return Path("Scribe-Android") / "app" / "src" / "main" / "assets" / "data" @@ -411,19 +408,19 @@ def check_command_line_args( Parameters ---------- - file_name : str - The name of the file for clear error outputs if necessary. + file_name : str + The name of the file for clear error outputs if necessary. - passed_values : UNKNOWN (will be checked) - An argument to be checked against known values. + passed_values : UNKNOWN (will be checked) + An argument to be checked against known values. - values_to_check : list(str) - The values that should be checked against. + values_to_check : list(str) + The values that should be checked against. Returns ------- - args: list(str) - The arguments or an error are returned depending on if they're correct. + args: list(str) + The arguments or an error are returned depending on if they're correct. """ try: args = ast.literal_eval(passed_values) @@ -466,19 +463,19 @@ def check_and_return_command_line_args( Parameters ---------- - all_args : list[str] - The arguments passed to the Scribe-Data file. + all_args : list[str] + The arguments passed to the Scribe-Data file. - first_args_check : list[str] - The values that the first argument should be checked against. + first_args_check : list[str] + The values that the first argument should be checked against. - second_args_check : list[str] - The values that the second argument should be checked against. + second_args_check : list[str] + The values that the second argument should be checked against. Returns ------- - first_args, second_args: Tuple[Optional[list[str]], Optional[list[str]]] - The subset of possible first and second arguments that have been verified as being valid. 
+ first_args, second_args: Tuple[Optional[list[str]], Optional[list[str]]] + The subset of possible first and second arguments that have been verified as being valid. """ if len(all_args) == 1: return None, None @@ -523,29 +520,30 @@ def format_sublanguage_name(lang, language_metadata=_languages): Parameters ---------- - lang : str - The name of the language or sub-language to format. + lang : str + The name of the language or sub-language to format. - language_metadata : dict - The metadata containing information about main languages and their sub-languages. + language_metadata : dict + The metadata containing information about main languages and their sub-languages. Returns ------- - str - The formatted language name if it's a sub-language (e.g., 'Nynorsk Norwegian'). - Otherwise the original name. + str + The formatted language name if it's a sub-language (e.g., 'Nynorsk Norwegian'). + Otherwise the original name. Raises ------ - ValueError: If the provided language or sub-language is not found. + ValueError + If the provided language or sub-language is not found. - Example - ------- - > format_sublanguage_name("nynorsk", language_metadata) - 'Nynorsk Norwegian' + Examples + -------- + > format_sublanguage_name("nynorsk", language_metadata) + 'Nynorsk Norwegian' - > format_sublanguage_name("english", language_metadata) - 'English' + > format_sublanguage_name("english", language_metadata) + 'English' """ for main_lang, lang_data in language_metadata.items(): # If it's not a sub-language, return the original name. @@ -598,14 +596,15 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages): # Check if there are sub-languages. if "sub_languages" in lang_data: # Add the sub-languages to current_languages with metadata. - for sub_key, sub_data in lang_data["sub_languages"].items(): - current_languages.append( - { - "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}", - "iso": sub_data.get("iso", ""), - "qid": sub_data.get("qid", ""), - } - ) + current_languages.extend( + { + "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}", + "iso": sub_data.get("iso", ""), + "qid": sub_data.get("qid", ""), + } + for sub_key, sub_data in lang_data["sub_languages"].items() + ) + else: # If no sub-languages, add the main language with metadata. current_languages.append( @@ -638,12 +637,12 @@ def check_lexeme_dump_prompt_download(output_dir: str): Parameters ---------- - output_dir : str - The directory to check for the existence of a Wikidata lexeme dump. + output_dir : str + The directory to check for the existence of a Wikidata lexeme dump. Returns ------- - None : The user is prompted to download a new Wikidata dump after the existence of one is checked. + None : The user is prompted to download a new Wikidata lexeme dump after the existence of one is checked. 
""" existing_dumps = list(Path(output_dir).glob("*.json.bz2")) if existing_dumps: @@ -651,7 +650,7 @@ def check_lexeme_dump_prompt_download(output_dir: str): for dump in existing_dumps: rprint(f" - {Path(output_dir)}/{dump.name}") - user_input = select( + user_input = questionary.select( "Do you want to:", choices=[ "Delete existing dumps", @@ -661,17 +660,17 @@ def check_lexeme_dump_prompt_download(output_dir: str): ], ).ask() - if user_input.startswith("Delete"): + if user_input == "Delete existing dumps": for dump in existing_dumps: dump.unlink() rprint("[bold green]Existing dumps deleted.[/bold green]") - download_input = select( + download_input = questionary.select( "Do you want to download the latest lexeme dump?", choices=["Yes", "No"] ).ask() return download_input != "Yes" - elif user_input.startswith("Use"): + elif user_input == "Use existing latest dump": # Check for the latest dump file. latest_dump = None if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps): @@ -712,9 +711,13 @@ def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool: Returns True if user chooses to skip (i.e., we do NOT proceed). Returns False if the file doesn't exist or user chooses to overwrite (i.e., we DO proceed). - Parameters: - index_path: Path to check - overwrite_all: If True, automatically overwrite without prompting + Parameters + ---------- + index_path : pathlib.Path + The path to check. + + overwrite_all : cool (default=False) + If True, automatically overwrite without prompting. """ if index_path.exists(): if overwrite_all: @@ -727,6 +730,7 @@ def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool: default="Skip process", ).ask() - # If user selects "Skip process", return True meaning "don't proceed" + # If user selects "Skip process", return True meaning "don't proceed". return choice == "Skip process" + return False diff --git a/src/scribe_data/wikidata/check_query/check.py b/src/scribe_data/wikidata/check_query/check.py index 41f1706a..955168b5 100644 --- a/src/scribe_data/wikidata/check_query/check.py +++ b/src/scribe_data/wikidata/check_query/check.py @@ -49,15 +49,15 @@ def ping(url: str, timeout: int) -> bool: Parameters ---------- - url : str - The URL to test. + url : str + The URL to test. - timeout : int - The maximum number of seconds to wait for a reply. + timeout : int + The maximum number of seconds to wait for a reply. Returns ------- - bool : True if connectivity is established or False otherwise. + bool : True if connectivity is established or False otherwise. """ try: with urllib.request.urlopen(url, timeout=timeout) as response: @@ -132,12 +132,12 @@ def check_sparql_file(fpath: str) -> Path: Parameters ---------- - fpath : str - The file to validate. + fpath : str + The file to validate. Returns ------- - Path : the validated file. + Path : the validated file. """ path = Path(fpath) @@ -156,19 +156,20 @@ def check_positive_int(value: str, err_msg: str) -> int: Parameters ---------- - value : str - The value to be validated. + value : str + The value to be validated. - err_msg : str - Used when value fails validation. + err_msg : str + Used when value fails validation. Returns ------- - int : the validated number. + int + The validated number. Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ with contextlib.suppress(ValueError): number = int(value) @@ -184,16 +185,17 @@ def check_limit(limit: str) -> int: Parameters ---------- - limit : str - The LIMIT to be validated. 
+ limit : str + The LIMIT to be validated. Returns ------- - int : the validated LIMIT. + int + The validated LIMIT. Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ return check_positive_int(limit, "LIMIT must be an integer of value 1 or greater.") @@ -204,16 +206,17 @@ def check_timeout(timeout: str) -> int: Parameters ---------- - timeout : str - The timeout to be validated. + timeout : str + The timeout to be validated. Returns ------- - int : the validated timeout. + int + The validated timeout. Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ return check_positive_int( timeout, "timeout must be an integer of value 1 or greater." @@ -226,12 +229,13 @@ def main(argv=None) -> int: Parameters ---------- - argv (default=None) - If set to None then argparse will use sys.argv as the arguments. + argv (default=None) + If set to None then argparse will use sys.argv as the arguments. Returns -------- - int : the exit status - 0 - success; any other value - failure. + int + The exit status - 0 - success; any other value - failure. """ cli = argparse.ArgumentParser( description=f"run SPARQL queries from the '{PROJECT_ROOT}' project", @@ -356,7 +360,8 @@ def error_report(failures: list[QueryExecutionException]) -> None: Parameters ---------- - failures (list[QueryExecutionException]) : failed queries. + failures : list[QueryExecutionException] + Failed queries. """ if not failures: return @@ -373,11 +378,11 @@ def success_report(successes: list[tuple[QueryFile, dict]], display: bool) -> No Parameters ---------- - successes : list[tuple[QueryFile, dict]] - Successful queries. + successes : list[tuple[QueryFile, dict]] + Successful queries. - display : bool - Whether there should be an output or not. + display : bool + Whether there should be an output or not. """ if not (display and successes): return diff --git a/src/scribe_data/wikidata/check_query/sparql.py b/src/scribe_data/wikidata/check_query/sparql.py index f702907f..b3c43d27 100644 --- a/src/scribe_data/wikidata/check_query/sparql.py +++ b/src/scribe_data/wikidata/check_query/sparql.py @@ -39,12 +39,13 @@ def sparql_context(url: str) -> SPARQL.SPARQLWrapper: Parameters ---------- - url : str - A valid URL of a SPARQL endpoint. + url : str + A valid URL of a SPARQL endpoint. Returns ------- - SPARQLWrapper : the context. + SPARQLWrapper + The context. """ context = SPARQL.SPARQLWrapper(url) context.setReturnFormat(SPARQL.JSON) @@ -61,21 +62,22 @@ def execute( Parameters ---------- - query : QueryFile - The SPARQL query to run. + query : QueryFile + The SPARQL query to run. - limit : int - The maximum number of results a query should return. + limit : int + The maximum number of results a query should return. - context : SPARQLWrapper - The SPARQL context. + context : SPARQLWrapper + The SPARQL context. - tries : int - The maximum number of times the query should be executed after failure. + tries : int + The maximum number of times the query should be executed after failure. Returns ------- - dict : the results of the query. + dict + The results of the query. """ def delay_in_seconds() -> int: diff --git a/src/scribe_data/wikidata/format_data.py b/src/scribe_data/wikidata/format_data.py index 68186dbe..2aa2db97 100644 --- a/src/scribe_data/wikidata/format_data.py +++ b/src/scribe_data/wikidata/format_data.py @@ -46,18 +46,18 @@ def format_data( Parameters ---------- - dir_path : str - The output directory path for results. + dir_path : str + The output directory path for results. 
- language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns _______ - A saved and formatted data file for the given language and data type. + A saved and formatted data file for the given language and data type. """ data_list, data_path = load_queried_data( dir_path=dir_path, language=language, data_type=data_type diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index e23be51e..bbe7c7b5 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -47,18 +47,18 @@ def execute_formatting_script(output_dir: str, language: str, data_type: str): Parameters ---------- - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - The results of the formatting script saved in the given output directory. + The results of the formatting script saved in the given output directory. """ formatting_file_path = Path(__file__).parent / "format_data.py" @@ -108,21 +108,21 @@ def query_data( Parameters ---------- - language : str - The language(s) to get. + language : str + The language(s) to get. - data_type : str - The data type(s) to get. + data_type : str + The data type(s) to get. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - overwrite : bool (default: False) - Whether to overwrite existing files. + overwrite : bool (default: False) + Whether to overwrite existing files. Returns ------- - Formatted data from Wikidata saved in the output directory. + Formatted data from Wikidata saved in the output directory. """ current_languages = list_all_languages(language_metadata) current_data_type = ["nouns", "verbs", "prepositions"] diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 14e5fc02..29182070 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -21,14 +21,15 @@ """ from pathlib import Path -from rich import print as rprint -from SPARQLWrapper import JSON, POST, SPARQLWrapper from typing import List, Union + import requests +from rich import print as rprint +from SPARQLWrapper import JSON, POST, SPARQLWrapper from scribe_data.cli.download import wd_lexeme_dump_download_wrapper +from scribe_data.utils import data_type_metadata, language_metadata from scribe_data.wiktionary.parse_dump import parse_dump -from scribe_data.utils import language_metadata, data_type_metadata sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) @@ -65,20 +66,24 @@ def parse_wd_lexeme_dump( wikidata_dump_path: str = None, ): """ - Checks for the existence of a Wikidata dump and parses it if possible. + Checks for the existence of a Wikidata lexeme dump and parses it if possible. Parameters ---------- language : Union[str, List[str]] The language(s) to parse the data for. Use "all" for all languages. 
+ wikidata_dump_type : List[str] - The type(s) of Wikidata dump to parse (e.g. ["total", "translations", "form"]). + The type(s) of Wikidata lexeme dump to parse (e.g. ["total", "translations", "form"]). + data_types : List[str] The categories to parse when using "form" type (e.g. ["nouns", "adverbs"]). + type_output_dir : str, optional The directory to save the parsed JSON data. If None, uses default directory. + wikidata_dump_path : str, optional - The local Wikidata dump directory that should be used to get data. + The local Wikidata lexeme dump directory that should be used to get data. """ # Convert "all" to list of all languages if isinstance(language, str) and language.lower() == "all": diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py index 37482bee..c4b8b450 100644 --- a/src/scribe_data/wikipedia/extract_wiki.py +++ b/src/scribe_data/wikipedia/extract_wiki.py @@ -47,24 +47,24 @@ def download_wiki(language="en", target_dir="wiki_dump", file_limit=None, dump_i Parameters ---------- - language : str (default=en) - The language of Wikipedia to download. + language : str (default=en) + The language of Wikipedia to download. - target_dir : pathlib.Path (default=wiki_dump) - The directory in the pwd into which files should be downloaded. + target_dir : pathlib.Path (default=wiki_dump) + The directory in the pwd into which files should be downloaded. - file_limit : int (default=None, all files) - The limit for the number of files to download. + file_limit : int (default=None, all files) + The limit for the number of files to download. - dump_id : str (default=None) - The id of an explicit Wikipedia dump that the user wants to download. + dump_id : str (default=None) + The id of an explicit Wikipedia dump that the user wants to download. - Note: a value of None will select the third from the last (latest stable dump). + Note: a value of None will select the third from the last (latest stable dump). Returns ------- - file_info : list of lists - Information on the downloaded Wikipedia dump files. + file_info : list of lists + Information on the downloaded Wikipedia dump files. """ if file_limit is not None: assert isinstance( @@ -148,16 +148,16 @@ def _process_article(title, text): Parameters ---------- - title : str - The title of the article. + title : str + The title of the article. - text : str - The text to be processed. + text : str + The text to be processed. Returns ------- - title, text: string, string - The data from the article. + title, text: string, string + The data from the article. """ wikicode = mwparserfromhell.parse(text) @@ -173,24 +173,24 @@ def iterate_and_parse_file(args): Parameters ---------- - args : tuple - The below arguments as a tuple for pool.imap_unordered rather than pool.starmap. + args : tuple + The below arguments as a tuple for pool.imap_unordered rather than pool.starmap. - input_path : pathlib.Path - The path to the data file. + input_path : pathlib.Path + The path to the data file. - partitions_dir : pathlib.Path - The path to where output file should be stored. + partitions_dir : pathlib.Path + The path to where output file should be stored. - article_limit : int (default=None) - An optional article_limit of the number of articles to find. + article_limit : int (default=None) + An optional article_limit of the number of articles to find. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the processes. 
+ verbose : bool (default=True) + Whether to show a tqdm progress bar for the processes. Returns ------- - A parsed file Wikipedia dump file with articles. + A parsed file Wikipedia dump file with articles. """ input_path, partitions_dir, article_limit, verbose = args @@ -296,30 +296,30 @@ def parse_to_ndjson( Parameters ---------- - output_path : str (default=articles) - The name of the final output ndjson file. + output_path : str (default=articles) + The name of the final output ndjson file. - input_dir : str (default=wikipedia_dump) - The path to the directory where the data is stored. + input_dir : str (default=wikipedia_dump) + The path to the directory where the data is stored. - partitions_dir : str (default=partitions) - The path to the directory where the output should be stored. + partitions_dir : str (default=partitions) + The path to the directory where the output should be stored. - article_limit : int (default=None) - An optional limit of the number of articles per dump file to find. + article_limit : int (default=None) + An optional limit of the number of articles per dump file to find. - delete_parsed_files : bool (default=False) - Whether to delete the separate parsed files after combining them. + delete_parsed_files : bool (default=False) + Whether to delete the separate parsed files after combining them. - multicore : bool (default=True) - Whether to use multicore processing. + multicore : bool (default=True) + Whether to use multicore processing. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the processes. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the processes. Returns ------- - Wikipedia dump files parsed and converted to json files. + Wikipedia dump files parsed and converted to json files. """ output_dir = "/".join(list(output_path.split("/")[:-1])) if not output_dir.exists(): diff --git a/src/scribe_data/wikipedia/process_wiki.py b/src/scribe_data/wikipedia/process_wiki.py index bd5bbb16..458e5dda 100644 --- a/src/scribe_data/wikipedia/process_wiki.py +++ b/src/scribe_data/wikipedia/process_wiki.py @@ -54,25 +54,25 @@ def clean( Parameters ---------- - texts : str or list - The texts to be cleaned and tokenized. + texts : str or list + The texts to be cleaned and tokenized. - language : string (default=en) - The language of the texts being cleaned. + language : string (default=en) + The language of the texts being cleaned. - remove_words : str or list (default=None) - Strings that should be removed from the text body. + remove_words : str or list (default=None) + Strings that should be removed from the text body. - sample_size : float (default=1) - The amount of data to be randomly sampled. + sample_size : float (default=1) + The amount of data to be randomly sampled. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the process. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the process. Returns ------- - cleaned_texts : list - The texts formatted for analysis. + cleaned_texts : list + The texts formatted for analysis. """ if isinstance(texts, str): texts = [texts] @@ -331,27 +331,27 @@ def gen_autosuggestions( Parameters ---------- - text_corpus : list - The Wikipedia texts formatted for word relation extraction. + text_corpus : list + The Wikipedia texts formatted for word relation extraction. - language : string (default=en) - The language autosuggestions are being generated for. 
+ language : string (default=en) + The language autosuggestions are being generated for. - num_words: int (default=500) - The number of words that autosuggestions should be generated for. + num_words: int (default=500) + The number of words that autosuggestions should be generated for. - ignore_words : str or list (default=None) - Strings that should be removed from the text body. + ignore_words : str or list (default=None) + Strings that should be removed from the text body. - update_local_data : bool (default=False) - Saves the created dictionaries as JSONs in the target directories. + update_local_data : bool (default=False) + Saves the created dictionaries as JSONs in the target directories. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the process. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the process. Returns ------- - Autosuggestions dictionaries for common words are saved locally or uploaded to Scribe apps. + Autosuggestions dictionaries for common words are saved locally or uploaded to Scribe apps. """ counter_obj = Counter(chain.from_iterable(text_corpus)) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 36bbbc69..45f00d19 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -22,19 +22,19 @@ import bz2 import time -import orjson - -from tqdm import tqdm +from collections import Counter, defaultdict from pathlib import Path -from collections import defaultdict, Counter -from typing import Union, List +from typing import List, Union + +import orjson +import questionary from scribe_data.utils import ( DEFAULT_DUMP_EXPORT_DIR, - language_metadata, - data_type_metadata, check_index_exists, + data_type_metadata, + language_metadata, ) -import questionary +from tqdm import tqdm class LexemeProcessor: @@ -51,38 +51,38 @@ def __init__( - 'total' data_types is a list of categories (e.g., ["nouns", "adverbs"]) for forms. """ - # Pre-compute sets for faster lookups + # Pre-compute sets for faster lookups. self.parse_type = set(parse_type or []) self.data_types = set(data_types or []) self.target_iso = set( [target_iso] if isinstance(target_iso, str) else target_iso or [] ) - # Pre-compute valid categories and languages + # Pre-compute valid categories and languages. self._category_lookup = {v: k for k, v in data_type_metadata.items()} self.valid_categories = set(data_type_metadata.values()) - # Build optimized language mapping + # Build optimized language mapping. self.iso_to_name = self._build_iso_mapping() self.valid_iso_codes = set(self.iso_to_name.keys()) - # Separate data structures + # Separate data structures. self.translations_index = defaultdict( lambda: defaultdict(lambda: defaultdict(dict)) ) self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) - # Stats + # Stats. self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} - # For category lookups, invert data_type_metadata - # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...} + # For category lookups, invert data_type_metadata. + # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...}. self._category_lookup = {v: k for k, v in data_type_metadata.items()} - # Build map from ISO to full language name + # Build map from ISO to full language name. self.iso_to_name = self._build_iso_mapping() - # For "total" usage + # For "total" usage. 
self.lexical_category_counts = defaultdict(Counter) self.translation_counts = defaultdict(Counter) self.forms_counts = defaultdict(Counter) @@ -97,9 +97,10 @@ def _build_iso_mapping(self) -> dict: for lang_name, data in language_metadata.items(): if self.target_iso and lang_name not in self.target_iso: continue - iso_code = data.get("iso") - if iso_code: + + if iso_code := data.get("iso"): iso_mapping[iso_code] = lang_name + return iso_mapping # MARK: process total @@ -116,16 +117,18 @@ def _process_lexeme_total(self, lexeme: dict) -> None: if not category_name: return - # Update counters + # Update counters. lemmas = lexeme.get("lemmas", {}) for lemma in lemmas.values(): lang = lemma.get("language") + if lang in self.iso_to_name: self.lexical_category_counts[lang][category_name] += 1 translation_count = sum( len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) ) self.translation_counts[lang][category_name] += translation_count + break # MARK: process translations @@ -144,7 +147,7 @@ def _process_lexeme_translations(self, lexeme: dict) -> None: if not category_name: return - # Only store first valid lemma for translations + # Only store first valid lemma for translations. for lang_code, lemma_data in lemmas.items(): if lang_code not in self.iso_to_name: continue @@ -153,7 +156,7 @@ def _process_lexeme_translations(self, lexeme: dict) -> None: if not word: continue - # Build translations from sense glosses + # Build translations from sense glosses. translations = {} for sense in lexeme.get("senses", []): for sense_lang_code, gloss in sense.get("glosses", {}).items(): @@ -162,7 +165,8 @@ def _process_lexeme_translations(self, lexeme: dict) -> None: if translations: self.translations_index[word][lang_code][category_name] = translations - break # Only handle the first lemma + + break # only handle the first lemma # MARK: process forms def _process_lexeme_forms(self, lexeme: dict) -> None: @@ -173,7 +177,7 @@ def _process_lexeme_forms(self, lexeme: dict) -> None: lemmas = lexeme.get("lemmas", {}) lexical_category = lexeme.get("lexicalCategory") - # Skip if category missing or not recognized + # Skip if category missing or not recognized. if not lexical_category or lexical_category not in data_type_metadata.values(): return @@ -183,11 +187,11 @@ def _process_lexeme_forms(self, lexeme: dict) -> None: return # If the category_name is NOT in our data_types list, skip - # e.g., category_name = "nouns", but user didn't request "nouns" in data_types + # e.g., category_name = "nouns", but user didn't request "nouns" in data_types. if category_name not in self.data_types: return - # Process forms + # Process forms. for lang_code, lemma_data in lemmas.items(): if lang_code not in self.iso_to_name: continue @@ -203,29 +207,29 @@ def _process_lexeme_forms(self, lexeme: dict) -> None: for rep_lang, rep_data in representations.items(): if rep_lang == lang_code: - form_value = rep_data.get("value") - if form_value: + if form_value := rep_data.get("value"): forms_data[form_value].extend(grammatical_features) if forms_data: self.forms_index[word][lang_code][category_name] = dict(forms_data) self.forms_counts[lang_code][category_name] += len(forms_data) + break # only first valid lemma # MARK: process lines def process_lines(self, line: str) -> None: """ Process one line of data. 
Depending on parse_type, we do: - - total stats - - translations - - form categories (filtered by data_types) + - total stats + - translations + - form categories (filtered by data_types) """ try: lexeme = orjson.loads(line.strip().rstrip(",")) if not lexeme: return - # Get common values once + # Get common values once. lemmas = lexeme.get("lemmas", {}) lexical_category = lexeme.get("lexicalCategory") @@ -236,7 +240,7 @@ def process_lines(self, line: str) -> None: if not category_name: return - # Process each type in a single pass through the data + # Process each type in a single pass through the data. for lang_code, lemma_data in lemmas.items(): if lang_code not in self.valid_iso_codes: continue @@ -256,13 +260,12 @@ def process_lines(self, line: str) -> None: ) if "translations" in self.parse_type: - translations = { + if translations := { lang: gloss["value"] for sense in lexeme.get("senses", []) for lang, gloss in sense.get("glosses", {}).items() if lang in self.valid_iso_codes - } - if translations: + }: self.translations_index[word][lang_code][category_name] = ( translations ) @@ -274,8 +277,7 @@ def process_lines(self, line: str) -> None: "representations", {} ).items(): if rep_lang == lang_code: - form_value = rep_data.get("value") - if form_value: + if form_value := rep_data.get("value"): forms_data[form_value].extend( form.get("grammaticalFeatures", []) ) @@ -286,7 +288,7 @@ def process_lines(self, line: str) -> None: ) self.forms_counts[lang_code][category_name] += len(forms_data) - break # Only process first valid lemma + break # only process first valid lemma except Exception as e: print(f"Error processing line: {e}") @@ -296,14 +298,14 @@ def process_file(self, file_path: str, batch_size: int = 50000): """ Main loop: read lines from file (bz2) in batches, call process_lines on each. """ - # Use context manager for better resource handling + # Use context manager for better resource handling. with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: - # Skip header if present + # Skip header if present. first_line = bzfile.readline() if not first_line.strip().startswith("["): bzfile.seek(0) - # Process in larger batches for better performance + # Process in larger batches for better performance. batch = [] start_time = time.time() total_entries = int(Path(file_path).stat().st_size / 263) @@ -311,28 +313,29 @@ def process_file(self, file_path: str, batch_size: int = 50000): for line in tqdm(bzfile, total=total_entries, desc="Processing entries"): if line.strip() not in ["[", "]", ",", ""]: batch.append(line) + if len(batch) >= batch_size: self._process_batch(batch) - batch.clear() # More efficient than creating new list + batch.clear() # more efficient than creating new list self.stats["processed_entries"] += 1 - # Process remaining items + # Process remaining items. if batch: self._process_batch(batch) - # Update stats + # Update stats. self.stats["processing_time"] = time.time() - start_time self.stats["unique_words"] = len(self.forms_index) + len( self.translations_index ) - # Print summary if "total" was requested + # Print summary if "total" was requested. if "total" in self.parse_type: self._print_total_summary() def _process_batch(self, batch: list) -> None: """ - Process a batch of lines + Process a batch of lines. """ for line in batch: self.process_lines(line) @@ -340,7 +343,7 @@ def _process_batch(self, batch: list) -> None: # MARK: print total summary def _print_total_summary(self): """ - Print stats if parse_type == total + Print stats if parse_type == total. 
""" print( f"{'Language':<20} {'Data Type':<25} {'Total Lexemes':<25} {'Total Translations':<20}" @@ -349,15 +352,19 @@ def _print_total_summary(self): for lang, counts in self.lexical_category_counts.items(): lang_name = self.iso_to_name[lang] first_row = True + for category, count in counts.most_common(): trans_count = self.translation_counts[lang][category] + if first_row: print( f"{lang_name:<20} {category:<25} {count:<25,} {trans_count:<20,}" ) first_row = False + else: print(f"{'':<20} {category:<25} {count:<25,} {trans_count:<20,}") + if lang != list(self.lexical_category_counts.keys())[-1]: print("\n" + "=" * 90 + "\n") @@ -372,12 +379,12 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N f"Warning: ISO {language_iso} unknown, skipping translations export..." ) return - # Filter - filtered = {} - for word, lang_data in self.translations_index.items(): - if language_iso in lang_data: - filtered[word] = {language_iso: lang_data[language_iso]} + filtered = { + word: {language_iso: lang_data[language_iso]} + for word, lang_data in self.translations_index.items() + if language_iso in lang_data + } self._save_by_language(filtered, filepath, language_iso, "translations") # MARK: export forms @@ -386,8 +393,8 @@ def export_forms_json( ) -> None: """ Save forms_index to file, optionally filtering by: - - language_iso - - data_type (e.g. "nouns", "adverbs") + - language_iso + - data_type (e.g. "nouns", "adverbs") If data_type is given, we only export that one category from forms. """ @@ -395,10 +402,11 @@ def export_forms_json( if language_iso not in self.iso_to_name: print(f"Warning: ISO {language_iso} unknown, skipping forms export...") return + filtered = {} for word, lang_data in self.forms_index.items(): if language_iso in lang_data: - # If data_type is given, only keep that category + # If data_type is given, only keep that category. if data_type: if data_type in lang_data[language_iso]: filtered[word] = { @@ -406,15 +414,17 @@ def export_forms_json( data_type: lang_data[language_iso][data_type] } } + else: filtered[word] = {language_iso: lang_data[language_iso]} + self._save_by_language( filtered, filepath, language_iso, data_type or "forms" ) def _save_by_language(self, data, filepath, language_iso, category_type): """ - Save data to exports//filename + Save data to exports//filename. """ base_path = Path(filepath) lang_name = self.iso_to_name[language_iso] @@ -437,6 +447,7 @@ def _to_dict(self, dd): """ if isinstance(dd, defaultdict): dd = {k: self._to_dict(v) for k, v in dd.items()} + return dd @@ -456,18 +467,23 @@ def parse_dump( ---------- language : str or list of str, optional Language(s) to parse data for. Must match language names in language_metadata. + parse_type : list of str, optional Types of parsing to perform. Valid options are: - 'translations': Extract word translations - 'form': Extract grammatical forms - 'total': Gather statistical totals + data_types : list of str, optional Categories to parse when using 'form' type (e.g. ["nouns", "adverbs"]). Only used if 'form' is in parse_type. + file_path : str, default="latest-lexemes.json.bz2" Path to the lexeme dump file + output_dir : str, optional Directory to save output files. If None, uses DEFAULT_DUMP_EXPORT_DIR. + overwrite_all : bool, default=False If True, automatically overwrite existing files without prompting @@ -480,11 +496,11 @@ def parse_dump( If a requested index file already exists, that language/category combination will be skipped. 
""" - # 1) Prepare environment - Use default if output_dir is None + # Prepare environment - Use default if output_dir is None. output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR Path(output_dir).mkdir(parents=True, exist_ok=True) - # Convert single strings to lists + # Convert single strings to lists. languages = [language] if isinstance(language, str) else language parse_type = parse_type or [] data_types = data_types or [] @@ -503,20 +519,22 @@ def parse_dump( if choice == "Overwrite existing data": overwrite_all = True - # For translations, we only need to check the translations index + # For translations, we only need to check the translations index. if "translations" in parse_type: languages_to_process = [] for lang in languages: index_path = Path(output_dir) / lang / "lexeme_translations.json" + if not check_index_exists(index_path, overwrite_all): languages_to_process.append(lang) + else: print(f"Skipping {lang}/translations.json - already exists") - # Update languages list but keep data_types as is + # Update languages list but keep data_types as is. languages = languages_to_process - # For forms, check each language/data_type combination + # For forms, check each language/data_type combination. elif "form" in parse_type: languages_to_process = [] data_types_to_process = set() @@ -525,16 +543,18 @@ def parse_dump( needs_processing = False for data_type in data_types: index_path = Path(output_dir) / lang / f"lexeme_{data_type}.json" + if not check_index_exists(index_path, overwrite_all): needs_processing = True data_types_to_process.add(data_type) + else: print(f"Skipping {lang}/{data_type}.json - already exists") if needs_processing: languages_to_process.append(lang) - # Update both lists + # Update both lists. languages = languages_to_process data_types = list(data_types_to_process) @@ -553,11 +573,11 @@ def parse_dump( # MARK: Handle JSON exports - # (a) If "translations" in parse_type -> export them + # (a) If "translations" in parse_type -> export them. if "translations" in parse_type: index_path = Path(output_dir) / "lexeme_translations.json" - # Export translations for each ISO found + # Export translations for each ISO found. iso_codes = set() for word_data in processor.translations_index.values(): iso_codes.update(word_data.keys()) @@ -565,9 +585,9 @@ def parse_dump( if iso_code in processor.iso_to_name: processor.export_translations_json(str(index_path), iso_code) - # (b) If "form" in parse_type -> export forms for each data_type in data_types + # (b) If "form" in parse_type -> export forms for each data_type in data_types. if "form" in parse_type: - # For each data_type, we create a separate file, e.g. lexeme_nouns.json + # For each data_type, we create a separate file, e.g. lexeme_nouns.json. for dt in data_types: index_path = Path(output_dir) / f"lexeme_{dt}.json" print(f"Exporting forms for {dt} to {index_path}...") diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py index 5faa486a..6968c8ad 100644 --- a/src/scribe_data/wiktionary/parse_mediaWiki.py +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -1,5 +1,5 @@ """ - Functions to parse the translations of a word from MediaWiki API. +Functions to parse the translations of a word from MediaWiki API. .. 
raw:: html """ -import re import json -from scribe_data.wikidata.wikidata_utils import mediaWiki_query +import re + from scribe_data.utils import get_language_from_iso +from scribe_data.wikidata.wikidata_utils import mediaWiki_query def fetch_translation_page(word): data = mediaWiki_query(word) pages = data.get("query", {}).get("pages", {}) - # Extract page object from dictionary + # Extract page object from dictionary. page = next(iter(pages.values())) if pages else {} - # Get the wikitext from the 'revisions' key - wikitext = page.get("revisions", [{}])[0].get("*", "") - return wikitext + + # Get the wikitext from the 'revisions' key. + return page.get("revisions", [{}])[0].get("*", "") def parse_wikitext_for_translations(wikitext): @@ -43,28 +44,29 @@ def parse_wikitext_for_translations(wikitext): language codes, part of speech, and context. """ translations_by_lang = {} - current_part_of_speech = None # Track whether we are in Noun or Verb - current_context = None # Track the current trans-top context + current_part_of_speech = None # track whether we are in Noun or Verb + current_context = None # track the current trans-top context # Split the wikitext into individual lines for line in wikitext.splitlines(): - # Detect part of speech/data-types: Noun or Verb + # Detect part of speech/data-types: Noun or Verb. if line.startswith("===Noun==="): current_part_of_speech = "Noun" + elif line.startswith("===Verb==="): current_part_of_speech = "Verb" - trans_top_match = re.match(r"\{\{trans-top\|(.+?)\}\}", line) - if trans_top_match: - current_context = trans_top_match.group(1).strip() - - template_match = re.match( - r"^\*\s([A-Za-z\s]+):\s\{\{t\+?\|([a-zA-Z\-]+)\|([^|]+)\}\}", line.strip() - ) - if template_match: - lang_code = template_match.group(2).strip() - translation_text = template_match.group(3).strip() - - # Ensure there's a list to hold translations for this language + + if trans_top_match := re.match(r"\{\{trans-top\|(.+?)\}\}", line): + current_context = trans_top_match[1].strip() + + if template_match := re.match( + r"^\*\s([A-Za-z\s]+):\s\{\{t\+?\|([a-zA-Z\-]+)\|([^|]+)\}\}", + line.strip(), + ): + lang_code = template_match[2].strip() + translation_text = template_match[3].strip() + + # Ensure there's a list to hold translations for this language. if lang_code not in translations_by_lang: translations_by_lang[lang_code] = [] @@ -84,17 +86,17 @@ def build_json_format(word, translations_by_lang): Build the final JSON format for the translations of a word. """ book_translations = {word: {}} - # Keep counters to number the translations for each (lang, part_of_speech) + # Keep counters to number the translations for each (lang, part_of_speech). language_counters = {} for lang_code, entries in translations_by_lang.items(): try: lang_name = get_language_from_iso(lang_code) except ValueError: - # Skip this language if it's not supported + # Skip this language if it's not supported. continue - # Make sure this language is in the dictionary + # Make sure this language is in the dictionary. if lang_name not in book_translations[word]: book_translations[word][lang_name] = {} @@ -109,7 +111,7 @@ def build_json_format(word, translations_by_lang): idx = str(language_counters[(lang_code, pos)]) - # Insert the item at the next available index + # Insert the item at the next available index. 
book_translations[word][lang_name][pos][idx] = { "description": desc, "translations": trans, diff --git a/tests/cli/test_download.py b/tests/cli/test_download.py index 29b24751..d4987b22 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/test_download.py @@ -127,7 +127,7 @@ def test_wd_lexeme_dump_download_wrapper_latest( mock_get.return_value.headers = {"content-length": "100"} mock_get.return_value.iter_content = lambda chunk_size: [b"data"] * 10 - # Mock DEFAULT_DUMP_EXPORT_DIR + # Mock DEFAULT_DUMP_EXPORT_DIR. with patch( "scribe_data.cli.download.DEFAULT_DUMP_EXPORT_DIR", new="test_export_dir" ): @@ -137,7 +137,7 @@ def test_wd_lexeme_dump_download_wrapper_latest( mock_makedirs.assert_called_with("test_export_dir", exist_ok=True) mock_confirm.assert_called_once() - @patch("scribe_data.utils.select") + @patch("scribe_data.utils.questionary.select") @patch( "scribe_data.utils.Path.glob", return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], @@ -146,7 +146,7 @@ def test_check_lexeme_dump_prompt_download_existing(self, mock_glob, mock_select """ Test prompt for using existing lexeme dump files. """ - # Mock the select dialog to return "Use existing latest dump" + # Mock the select dialog to return "Use existing latest dump". mock_select.return_value.ask.return_value = "Use existing latest dump" result = check_lexeme_dump_prompt_download( @@ -154,7 +154,7 @@ def test_check_lexeme_dump_prompt_download_existing(self, mock_glob, mock_select ) self.assertEqual(result.name, "latest-lexemes.json.bz2") - @patch("scribe_data.utils.select") + @patch("scribe_data.utils.questionary.select") @patch( "scribe_data.utils.Path.glob", return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], @@ -163,7 +163,7 @@ def test_check_lexeme_dump_prompt_download_delete(self, mock_glob, mock_select): """ Test prompt for deleting existing lexeme dump files. """ - # Configure the mock to return "Delete existing dumps" first and then "No" + # Configure the mock to return "Delete existing dumps" first and then "No". mock_select.side_effect = [ MagicMock(ask=MagicMock(return_value="Delete existing dumps")), MagicMock(ask=MagicMock(return_value="No")), diff --git a/tests/wikidata/test_check_query.py b/tests/wikidata/test_check_query.py index e50b3955..19b3097b 100755 --- a/tests/wikidata/test_check_query.py +++ b/tests/wikidata/test_check_query.py @@ -25,6 +25,7 @@ from pathlib import Path from unittest.mock import MagicMock, mock_open, patch from urllib.error import HTTPError + import pytest from scribe_data.wikidata.check_query.check import ( all_queries, From f28a176f07ab829ba3232faa7daafada39861d56 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 4 Jan 2025 14:19:19 +0100 Subject: [PATCH 13/13] Minor fix to the contribution guide --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f929c112..d7f767fe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -300,7 +300,7 @@ def example_function(argument: argument_type) -> return_type: Parameters ---------- - argument: argument_type + argument : argument_type Description of your argument. Returns
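The test changes above patch scribe_data.utils.questionary.select and stub out its .ask() return values. A minimal standalone sketch of that mocking pattern, assuming Scribe-Data is importable; the test class name and prompt strings here are illustrative:

import unittest
from unittest.mock import MagicMock, patch


class TestQuestionarySelectMocking(unittest.TestCase):
    """Illustrative sketch of mocking questionary.select().ask() in tests."""

    @patch("scribe_data.utils.questionary.select")
    def test_single_prompt(self, mock_select):
        # One prompt: .ask() on the returned dialog yields the chosen option.
        mock_select.return_value.ask.return_value = "Use existing latest dump"
        self.assertEqual(mock_select("Do you want to:").ask(), "Use existing latest dump")

    @patch("scribe_data.utils.questionary.select")
    def test_sequential_prompts(self, mock_select):
        # Two prompts in sequence: side_effect supplies one mock dialog per call.
        mock_select.side_effect = [
            MagicMock(ask=MagicMock(return_value="Delete existing dumps")),
            MagicMock(ask=MagicMock(return_value="No")),
        ]
        self.assertEqual(mock_select("Do you want to:").ask(), "Delete existing dumps")
        self.assertEqual(mock_select("Download new version?").ask(), "No")


if __name__ == "__main__":
    unittest.main()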