diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 924927ecb..d7f767fea 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -300,13 +300,18 @@ def example_function(argument: argument_type) -> return_type: Parameters ---------- - argument: argument_type - Description of your argument. + argument : argument_type + Description of your argument. Returns ------- - return_value : return_type - Description of your return value. + return_value : return_type + Description of your return value. + + Raises + ------ + ErrorType + Description of the error and the condition that raises it. """ ... diff --git a/requirements.txt b/requirements.txt index abbd5e443..4e1d6d554 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ ruff>=0.3.3 SPARQLWrapper>=2.0.0 sphinx-rtd-theme>=3.0.0 tqdm==4.66.4 +orjson>=3.10.12 diff --git a/src/scribe_data/check/check_project_metadata.py b/src/scribe_data/check/check_project_metadata.py index 84523ba25..159d8ca21 100644 --- a/src/scribe_data/check/check_project_metadata.py +++ b/src/scribe_data/check/check_project_metadata.py @@ -88,16 +88,16 @@ def get_missing_languages( Parameters ---------- - reference_languages : dict - A dictionary of languages from the reference source. + reference_languages : dict + A dictionary of languages from the reference source. - target_languages : dict - A dictionary of languages from the target source to check for missing entries. + target_languages : dict + A dictionary of languages from the target source to check for missing entries. Returns ------- - list[str] - A list of languages and sub-languages that are in target_languages but not in reference_languages. + list[str] + A list of languages and sub-languages that are in target_languages but not in reference_languages. """ missing_languages = [] reference_keys = reference_languages.keys() @@ -130,17 +130,17 @@ def validate_language_properties(languages_dict: dict) -> dict: Parameters ---------- - languages_dict : dict - A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key. + languages_dict : dict + A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key. Returns ------- - dict: A dictionary with two lists: - - "missing_qids": Languages or sub-languages missing the 'qid' property. - - "missing_isos": Languages or sub-languages missing the 'iso' property. + dict: A dictionary with two lists: + - "missing_qids": Languages or sub-languages missing the 'qid' property. + - "missing_isos": Languages or sub-languages missing the 'iso' property. - Each entry in these lists is in the format "parent_language - sub_language" for sub-languages, - or simply "parent_language" for the parent languages. + Each entry in these lists is in the format "parent_language - sub_language" for sub-languages, + or simply "parent_language" for the parent languages. 
""" missing_qids = [] missing_isos = [] diff --git a/src/scribe_data/check/check_project_structure.py b/src/scribe_data/check/check_project_structure.py index 612299dfb..edfe90853 100644 --- a/src/scribe_data/check/check_project_structure.py +++ b/src/scribe_data/check/check_project_structure.py @@ -48,24 +48,24 @@ def check_for_sparql_files(folder_path, data_type, language, subdir, missing_que Parameters ---------- - folder_path : str - The path to the data-type folder. + folder_path : str + The path to the data-type folder. - data_type : str - The name of the data type being checked. + data_type : str + The name of the data type being checked. - language : str - The name of the language being processed. + language : str + The name of the language being processed. - subdir : str or None - The name of the sub-directory (for languages with sub-dialects), or None. + subdir : str or None + The name of the sub-directory (for languages with sub-dialects), or None. - missing_queries : list - A list to which missing SPARQL query files will be appended. + missing_queries : list + A list to which missing SPARQL query files will be appended. Returns ------- - bool: True if at least one .sparql file is found, False otherwise. + bool: True if at least one .sparql file is found, False otherwise. """ sparql_files = [f for f in os.listdir(folder_path) if f.endswith(".sparql")] diff --git a/src/scribe_data/check/check_pyicu.py b/src/scribe_data/check/check_pyicu.py index a2d645ce5..456d4ed82 100644 --- a/src/scribe_data/check/check_pyicu.py +++ b/src/scribe_data/check/check_pyicu.py @@ -27,8 +27,8 @@ from pathlib import Path import pkg_resources +import questionary import requests -from questionary import confirm def check_if_pyicu_installed(): @@ -90,15 +90,15 @@ def download_wheel_file(wheel_url, output_dir): Parameters ---------- - wheel_url : str - The URL of the wheel file to download. + wheel_url : str + The URL of the wheel file to download. - output_dir : str - The directory to save the downloaded file. + output_dir : str + The directory to save the downloaded file. Returns ------- - str : path to the downloaded wheel file. + str : path to the downloaded wheel file. """ response = requests.get(wheel_url) response.raise_for_status() # raise an error for bad responses @@ -118,18 +118,18 @@ def find_matching_wheel(wheels, python_version, architecture): Parameters ---------- - wheels : list - The list of available wheels. + wheels : list + The list of available wheels. - python_version : str - The Python version (e.g., 'cp311'). + python_version : str + The Python version (e.g., 'cp311'). - architecture : str - The architecture type (e.g., 'win_amd64'). + architecture : str + The architecture type (e.g., 'win_amd64'). Returns ------- - str : The download URL of the matching wheel or None if not found. + str : The download URL of the matching wheel or None if not found. """ return next( ( @@ -148,8 +148,7 @@ def check_and_install_pyicu(): # Fetch available wheels from GitHub to estimate download size. wheels, total_size_mb = fetch_wheel_releases() - # Use questionary to ask for user confirmation - user_wants_to_proceed = confirm( + user_wants_to_proceed = questionary.confirm( f"{package_name} is not installed.\nScribe-Data can install the package and the needed dependencies." f"\nApproximately {total_size_mb:.2f} MB will be downloaded.\nDo you want to proceed?" 
).ask() diff --git a/src/scribe_data/check/check_query_forms.py b/src/scribe_data/check/check_query_forms.py index 12c4d96df..5435c8447 100644 --- a/src/scribe_data/check/check_query_forms.py +++ b/src/scribe_data/check/check_query_forms.py @@ -57,18 +57,18 @@ def extract_forms_from_sparql(file_path: Path) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract forms. + file_path : Path + The path to the SPARQL query file from which to extract forms. Returns ------- - query_form_dict : dict - The file path with form labels of the query and their respective QIDs. + query_form_dict : dict + The file path with form labels of the query and their respective QIDs. Raises ------ - FileNotFoundError - If the specified file does not exist. + FileNotFoundError + If the specified file does not exist. """ optional_pattern = r"\s\sOPTIONAL\s*\{([^}]*)\}" try: @@ -95,13 +95,13 @@ def extract_form_rep_label(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - str - The label of the form representation. + str + The label of the form representation. """ onto_rep_pattern = r"ontolex:representation .* ;" if line_match := re.search(pattern=onto_rep_pattern, string=form_text): @@ -119,13 +119,13 @@ def decompose_label_features(label): Parameters ---------- - label : str - The concatenated label string composed of several grammatical features. + label : str + The concatenated label string composed of several grammatical features. Returns ------- - list - A list of grammatical features extracted from the label in their original order. + list + A list of grammatical features extracted from the label in their original order. """ components = re.findall(r"[A-Za-z][^A-Z]*", label) valid_components = [] @@ -157,13 +157,13 @@ def extract_form_qids(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - list[str] - All QIDS that make up the form. + list[str] + All QIDs that make up the form. """ qids_pattern = r"wikibase:grammaticalFeature .+ \." if match := re.search(pattern=qids_pattern, string=form_text): @@ -179,13 +179,13 @@ def check_form_label(form_text: str): Parameters ---------- - form_text : str - The text that defines the form within the query. + form_text : str + The text that defines the form within the query. Returns ------- - bool - Whether the form and its current representation label match (repForm and rep). + bool + Whether the form and its current representation label match (repForm and rep). """ form_label_line_pattern = r"\?lexeme ontolex:lexicalForm .* \." @@ -221,13 +221,13 @@ def check_query_formatting(form_text: str): Parameters ---------- - query_text : str - The SPARQL query text to check. + form_text : str + The form text of the SPARQL query to check. Returns ------- - bool - Whether there are formatting errors with the query. + bool + Whether there are formatting errors with the query. """ # Check for spaces before commas that should not exist. if re.search(r"\s,", form_text): @@ -249,13 +249,13 @@ def return_correct_form_label(qids: list): Parameters ---------- - qids : list[str] - All QIDS that make up the form. + qids : list[str] + All QIDs that make up the form. Returns ------- - correct_label : str - The label for the representation given the QIDs.
+ correct_label : str + The label for the representation given the QIDs. """ if not qids: return "Invalid query formatting found" @@ -289,14 +289,14 @@ def validate_forms(query_text: str) -> str: Parameters ---------- - query_file : str - The SPARQL query text as a string. + query_text : str + The SPARQL query text as a string. Returns ------- - str - Error message if there are any issues with the order of variables or forms, - otherwise an empty string. + str + Error message if there are any issues with the order of variables or forms, + otherwise an empty string. """ select_pattern = r"SELECT\s+(.*?)\s+WHERE" @@ -376,13 +376,13 @@ def check_docstring(query_text: str) -> bool: Parameters ---------- - query_text : str - The SPARQL query's text to be checked. + query_text : str + The SPARQL query's text to be checked. Returns ------- - bool - True if the docstring is correctly formatted. + bool + True if the docstring is correctly formatted. """ # Split the text into lines. query_lines = query_text.splitlines(keepends=True) @@ -418,14 +418,14 @@ def check_forms_order(query_text): Parameters ---------- - query_text : str - The SPARQL query text containing the SELECT statement with variables. + query_text : str + The SPARQL query text containing the SELECT statement with variables. Returns ------- - list or bool - A sorted list of variables if the ordering differs from the original, - otherwise a boolean indicating that the order matches. + list or bool + A sorted list of variables if the ordering differs from the original, + otherwise a boolean indicating that the order matches. """ select_pattern = r"SELECT\s+(.*?)\s+WHERE" @@ -496,14 +496,14 @@ def check_optional_qid_order(query_file: str) -> str: Parameters ---------- - query_file : str - The path to the SPARQL query file to be checked. + query_file : str + The path to the SPARQL query file to be checked. Returns ------- - str - A formatted string with details on any order mismatches in the QIDs, or an empty - string if all QIDs are correctly ordered. + str + A formatted string with details on any order mismatches in the QIDs, or an empty + string if all QIDs are correctly ordered. """ forms = extract_forms_from_sparql(query_file) error_messages = [] diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index 3d1779e7a..5337b972c 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -41,21 +41,21 @@ def is_valid_language(query_file: Path, lang_qid: str) -> bool: Parameters ---------- - query_file : Path - The path to the SPARQL query file being validated. + query_file : Path + The path to the SPARQL query file being validated. - lang_qid : str - The QID of the language extracted from the SPARQL query. + lang_qid : str + The QID of the language extracted from the SPARQL query. Returns ------- - bool - True if the language QID is valid, otherwise False. + bool + True if the language QID is valid, otherwise False. - Example - ------- - > is_valid_language(Path("path/to/query.sparql"), "Q123456") - True + Examples + -------- + > is_valid_language(Path("path/to/query.sparql"), "Q123456") + True """ lang_directory_name = query_file.parent.parent.name.lower() language_entry = language_metadata.get(lang_directory_name) @@ -79,21 +79,21 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: Parameters ---------- - query_file : Path - The path to the SPARQL query file being validated.
+ query_file : Path + The path to the SPARQL query file being validated. - data_type_qid : str - The QID of the data type extracted from the SPARQL query. + data_type_qid : str + The QID of the data type extracted from the SPARQL query. Returns ------- - bool - True if the data type QID is valid, otherwise False. + bool + True if the data type QID is valid, otherwise False. - Example - ------- - > is_valid_data_type(Path("path/to/query.sparql"), "Q654321") - True + Examples + -------- + > is_valid_data_type(Path("path/to/query.sparql"), "Q654321") + True """ directory_name = query_file.parent.name # e.g., "nouns" or "verbs" expected_data_type_qid = data_type_metadata.get(directory_name) @@ -107,21 +107,21 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract the QID. + file_path : Path + The path to the SPARQL query file from which to extract the QID. - pattern : str - The regex pattern used to match the QID (either for language or data type). + pattern : str + The regex pattern used to match the QID (either for language or data type). Returns ------- - str - The extracted QID if found, otherwise None. + str + The extracted QID if found, otherwise None. Raises ------ - FileNotFoundError - If the specified file does not exist. + FileNotFoundError + If the specified file does not exist. """ try: with open(file_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index 5c8cd14bf..6fa466515 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -34,12 +34,12 @@ def correct_data_type(data_type: str) -> str: Parameters ---------- - data_type : str - The data type to potentially correct. + data_type : str + The data type to potentially correct. Returns ------- - The data_type value or a corrected version of it. + The data_type value or a corrected version of it. """ all_data_types = data_type_metadata.keys() @@ -122,16 +122,16 @@ def validate_language_and_data_type( Parameters ---------- - language : str or list - The language(s) to validate. + language : str or list + The language(s) to validate. - data_type : str or list - The data type(s) to validate. + data_type : str or list + The data type(s) to validate. Raises ------ - ValueError - If any of the languages or data types is invalid, with all errors reported together. + ValueError + If any of the languages or data types is invalid, with all errors reported together. """ def validate_single_item(item, valid_options, item_type): @@ -140,19 +140,19 @@ def validate_single_item(item, valid_options, item_type): Parameters ---------- - item : str - The item to validate. - valid_options : list + item : str + The item to validate. - A list of valid options against which the item will be validated. + valid_options : list + A list of valid options against which the item will be validated. - item_type : str - A description of the item type (e.g., "language", "data-type") used in error messages. + item_type : str + A description of the item type (e.g., "language", "data-type") used in error messages. Returns ------- - str or None - Returns an error message if the item is invalid, or None if the item is valid. + str or None + Returns an error message if the item is invalid, or None if the item is valid. 
""" if ( isinstance(item, str) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index a2e2f777e..275182444 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -51,30 +51,30 @@ def convert_to_json( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : Union[str, List[str]] - The data type of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. - output_type : str - The output format, should be "json". + output_type : str + The output format, should be "json". - input_file : str - The input CSV/TSV file path. + input_file : str + The input CSV/TSV file path. - output_dir : Path - The output directory path for results. + output_dir : Path + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". Returns ------- - None + None """ if not language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -205,30 +205,30 @@ def convert_to_csv_or_tsv( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : Union[str, List[str]] - The data type of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. - output_type : str - The output format, should be "csv" or "tsv". + output_type : str + The output format, should be "csv" or "tsv". - input_file : str - The input JSON file path. + input_file : str + The input JSON file path. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". Returns ------- - None + None """ if not language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -391,30 +391,30 @@ def convert_to_sqlite( Parameters ---------- - language : str - The language of the file to convert. + language : str + The language of the file to convert. - data_type : str - The data type of the file to convert. + data_type : str + The data type of the file to convert. - output_type : str - The output format, should be "sqlite". + output_type : str + The output format, should be "sqlite". - input_file : Path - The input file path for the data to be converted. + input_file : Path + The input file path for the data to be converted. - output_dir : Path - The output directory path for results. + output_dir : Path + The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + overwrite : bool + Whether to overwrite existing files. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". Returns ------- - A SQLite file saved in the given location. + A SQLite file saved in the given location. 
""" if input_file: input_file = Path(input_file) @@ -487,7 +487,7 @@ def convert_wrapper( Returns ------- - None + None """ output_type = output_type.lower() diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py index 2f741545a..f7f29adf9 100644 --- a/src/scribe_data/cli/download.py +++ b/src/scribe_data/cli/download.py @@ -27,6 +27,7 @@ from pathlib import Path from typing import Optional +import questionary import requests from rich import print as rprint from tqdm import tqdm @@ -45,13 +46,16 @@ def parse_date(date_string): Parameters ---------- - date_string : str - The date string to be parsed. + date_string : str + The date string to be parsed. Returns ------- - datetime.date : Parsed date object if the format is valid. - None : If the date format is invalid. + datetime.date + Parsed date object if the format is valid. + + None + If the date format is invalid. """ formats = ["%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"] for fmt in formats: @@ -75,19 +79,22 @@ def available_closest_lexeme_dumpfile( Parameters ---------- - target_entity : str - The target date for which the dump is requested (format: YYYY/MM/DD or similar). + target_entity : str + The target date for which the dump is requested (format: YYYY/MM/DD or similar). - other_old_dumps : list - List of available dump folders as strings. + other_old_dumps : list + List of available dump folders as strings. - check_wd_dump_exists : function - A function to validate if the dump file exists. + check_wd_dump_exists : function + A function to validate if the dump file exists. Returns ------- - str : The closest available dump file date (as a string). - None : If no suitable dump is found. + str + The closest available dump file date (as a string). + + None + If no suitable dump is found. """ target_date = parse_date(target_entity) closest_date = None @@ -121,16 +128,19 @@ def download_wd_lexeme_dump(target_entity: str = "latest-lexemes"): Parameters ---------- - target_entity : str, optional - The target dump to download. Defaults to "latest-lexemes". + target_entity : str, optional + The target dump to download. Defaults to "latest-lexemes". - - If "latest-lexemes", downloads the latest dump. - - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date. + - If "latest-lexemes", downloads the latest dump. + - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date. Returns ------- - str : The URL of the requested or closest available dump. - None : If no suitable dump is found or the request fails. + str + The URL of the requested or closest available dump. + + None + If no suitable dump is found or the request fails. """ base_url = "https://dumps.wikimedia.org/wikidatawiki/entities" @@ -218,12 +228,12 @@ def wd_lexeme_dump_download_wrapper( Parameters ---------- - wikidata_dump : str - Optional date string in YYYYMMDD format for specific dumps. + wikidata_dump : str + Optional date string in YYYYMMDD format for specific dumps. - output_dir : str - Optional directory path for the downloaded file. - Defaults to 'scribe_data_wikidata_dumps_export' directory. + output_dir : str + Optional directory path for the downloaded file. + Defaults to 'scribe_data_wikidata_dumps_export' directory. 
""" dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes") @@ -244,16 +254,12 @@ def wd_lexeme_dump_download_wrapper( filename = dump_url.split("/")[-1] output_path = str(Path(output_dir) / filename) - user_response = ( - input( - "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities." - "\nDo you want to proceed? (y/n): " - ) - .strip() - .lower() - ) + user_response = questionary.confirm( + "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. Do you want to proceed?", + default=True, + ).ask() - if user_response == "y": + if user_response: rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]") response = requests.get(dump_url, stream=True) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 771774aa4..3e2f38972 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -24,6 +24,7 @@ from pathlib import Path from typing import List, Union +import questionary from rich import print as rprint from scribe_data.cli.convert import convert_wrapper @@ -55,39 +56,39 @@ def get_data( Parameters ---------- - language : str - The language(s) to get. + language : str + The language(s) to get. - data_type : str - The data type(s) to get. + data_type : str + The data type(s) to get. - output_type : str - The output file type. + output_type : str + The output file type. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - outputs_per_entry : str - How many outputs should be generated per data entry. + outputs_per_entry : str + How many outputs should be generated per data entry. - overwrite : bool (default: False) - Whether to overwrite existing files. + overwrite : bool (default: False) + Whether to overwrite existing files. - all_bool : bool - Get all languages and data types. + all_bool : bool + Get all languages and data types. - interactive : bool (default: False) - Whether it's running in interactive mode. + interactive : bool (default: False) + Whether it's running in interactive mode. - identifier_case : str - The case format for identifiers. Default is "camel". + identifier_case : str + The case format for identifiers. Default is "camel". - wikidata_dump : str - The local Wikidata dump that can be used to process data. + wikidata_dump : str + The local Wikidata lexeme dump that can be used to process data. Returns ------- - The requested data saved locally given file type and location arguments. + The requested data saved locally given file type and location arguments. """ # MARK: Defaults @@ -108,16 +109,20 @@ def prompt_user_download_all(): """ Checks with the user if they'd rather use Wikidata lexeme dumps before a download all call. """ - download_all_input = input( - "Do you want to query Wikidata, or would you rather use Wikidata lexeme dumps? (y/N): " - ) - return download_all_input == "y" + return questionary.confirm( + "Do you want to query Wikidata directly? 
(selecting 'no' will use Wikidata lexeme dumps)", + default=False, + ).ask() if all_bool: if language: if prompt_user_download_all(): - parse_wd_lexeme_dump() - + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["form"], + data_types=data_types, + type_output_dir=output_dir, + ) else: language_or_sub_language = language.split(" ")[0] print(f"Updating all data types for language: {language.title()}") @@ -133,8 +138,12 @@ def prompt_user_download_all(): elif data_type: if prompt_user_download_all(): - parse_wd_lexeme_dump() - + parse_wd_lexeme_dump( + language=None, + wikidata_dump_type=["form"], + data_types=[data_type], + type_output_dir=output_dir, + ) else: print(f"Updating all languages for data type: {data_type.capitalize()}") query_data( @@ -150,15 +159,46 @@ def prompt_user_download_all(): else: print("Updating all languages and data types...") rprint( - "[bold red]Note that the download all functionality must use Wikidata dumps to observe responsible Wikidata Query Service usage practices.[/bold red]" + "[bold red]Note that the download all functionality must use Wikidata lexeme dumps to observe responsible Wikidata Query Service usage practices.[/bold red]" + ) + parse_wd_lexeme_dump( + language="all", + wikidata_dump_type=["form", "translations"], + data_types="all", + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, ) - parse_wd_lexeme_dump() # MARK: Emojis elif data_type in {"emoji-keywords", "emoji_keywords"}: generate_emoji(language=language, output_dir=output_dir) + # MARK: Translations + + elif data_type == "translations": + if language is None: + language = "all" + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["translations"], + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + ) + return + + # MARK: Form Dump + + elif wikidata_dump: + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["form"], + data_types=data_types, + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + ) + return + # MARK: Query Data elif language or data_type: diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index d3e8dd1db..5e5dec74b 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -27,7 +27,6 @@ import questionary from prompt_toolkit import prompt from prompt_toolkit.completion import WordCompleter -from questionary import Choice from rich import print as rprint from rich.console import Console from rich.logging import RichHandler @@ -37,13 +36,14 @@ # from scribe_data.cli.list import list_wrapper from scribe_data.cli.get import get_data from scribe_data.cli.total import total_wrapper -from scribe_data.cli.version import get_local_version from scribe_data.utils import ( + DEFAULT_DUMP_EXPORT_DIR, DEFAULT_JSON_EXPORT_DIR, data_type_metadata, language_metadata, list_all_languages, ) +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump # MARK: Config Setup @@ -260,9 +260,12 @@ def request_total_lexeme_loop(): choice = questionary.select( "What would you like to do?", choices=[ - Choice("Configure total lexemes request", "total"), - Choice("Run total lexemes request", "run"), - Choice("Exit", "exit"), + questionary.Choice("Configure total lexemes request", "total"), + questionary.Choice("Run total lexemes request", "run"), + questionary.Choice( + "Run total lexemes request with lexeme dumps", "run_all" + ), + questionary.Choice("Exit", "exit"), ], ).ask() @@ -275,6 +278,18 @@ def request_total_lexeme_loop(): 
config.selected_languages, config.selected_data_types = [], [] rprint(THANK_YOU_MESSAGE) break + elif choice == "run_all": + if wikidata_dump_path := prompt( + f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): " + ): + wikidata_dump_path = Path(wikidata_dump_path) + + parse_wd_lexeme_dump( + language=config.selected_languages, + wikidata_dump_type=["total"], + wikidata_dump_path=wikidata_dump_path, + ) + break elif choice == "exit": return else: @@ -289,7 +304,7 @@ def request_total_lexeme_loop(): # See list of languages. # """ -# choice = questionary.select( +# choice = select( # "What would you like to list?", # choices=[ # Choice("All languages", "all_languages"), @@ -313,37 +328,46 @@ def start_interactive_mode(operation: str = None): Parameters ---------- - operation : str - The type of operation that interactive mode is being ran with. + operation : str + The type of operation that interactive mode is being run with. """ - rprint(f"[bold cyan]Welcome to {get_local_version()} interactive mode![/bold cyan]") while True: # Check if both selected_languages and selected_data_types are empty. if not config.selected_languages and not config.selected_data_types: if operation == "get": choices = [ - Choice("Configure get data request", "configure"), + questionary.Choice("Configure get data request", "configure"), # Choice("See list of languages", "languages"), - Choice("Exit", "exit"), + questionary.Choice("Exit", "exit"), ] elif operation == "total": choices = [ - Choice("Configure total lexemes request", "total"), + questionary.Choice("Configure total lexemes request", "total"), # Choice("See list of languages", "languages"), - Choice("Exit", "exit"), + questionary.Choice("Exit", "exit"), + ] + elif operation == "translations": + choices = [ + questionary.Choice( + "Configure translations request", "translations" + ), + # Choice("See list of languages", "languages"), + questionary.Choice("Exit", "exit"), + ] else: choices = [ - Choice("Configure get data request", "configure"), - Choice("Exit", "exit"), + questionary.Choice("Configure get data request", "configure"), + questionary.Choice("Exit", "exit"), ] if config.configured: - choices.insert(1, Choice("Request for get data", "run")) + choices.insert(1, questionary.Choice("Request for get data", "run")) else: - choices.insert(1, Choice("Request for total lexeme", "total")) + choices.insert( + 1, questionary.Choice("Request for total lexeme", "total") + ) choice = questionary.select("What would you like to do?", choices=choices).ask() @@ -356,6 +380,29 @@ def start_interactive_mode(operation: str = None): request_total_lexeme_loop() break + elif choice == "translations": + prompt_for_languages() + + if wikidata_dump_path := prompt( + f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): " + ): + wikidata_dump_path = Path(wikidata_dump_path) + + if output_dir := prompt( + f"Enter output directory (default: {config.output_dir}): " + ): + config.output_dir = Path(output_dir) + + parse_wd_lexeme_dump( + language=config.selected_languages, + wikidata_dump_type=["translations"], + data_types=None, + type_output_dir=config.output_dir, + wikidata_dump_path=wikidata_dump_path, + ) + + break + # elif choice == "languages": # see_list_languages() # break diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index a2aebfce6..72175879f 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -70,8 +70,8 @@ def list_data_types(language: str = None) -> None: Parameters ----------
- language : str - The language to potentially list data types for. + language : str + The language to potentially list data types for. """ languages = list_all_languages(language_metadata) if language: @@ -142,8 +142,8 @@ def list_languages_for_data_type(data_type: str) -> None: Parameters ---------- - data_type : str - The data type to check for. + data_type : str + The data type to check for. """ data_type = correct_data_type(data_type=data_type) all_languages = list_languages_with_metadata_for_data_type(language_metadata) @@ -179,14 +179,14 @@ def list_wrapper( Parameters ---------- - language : str - The language to potentially list data types for. + language : str + The language to potentially list data types for. - data_type : str - The data type to check for. + data_type : str + The data type to check for. - all_bool : boolean - Whether all languages and data types should be listed. + all_bool : boolean + Whether all languages and data types should be listed. """ if (not language and not data_type) or all_bool: list_all() diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index beeef0caf..e22f4aead 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -24,6 +24,7 @@ import argparse from pathlib import Path +from questionary import select from rich import print as rprint from scribe_data.cli.cli_utils import validate_language_and_data_type @@ -35,6 +36,7 @@ from scribe_data.cli.total import total_wrapper from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message +from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." GET_DESCRIPTION = ( @@ -167,6 +169,9 @@ def main() -> None: type=str, help="Path to a local Wikidata lexemes dump for running with '--all'.", ) + get_parser.add_argument( + "-t", "--translation", type=str, help="parse a single word using MediaWiki API" + ) # MARK: Total @@ -200,7 +205,8 @@ def main() -> None: total_parser.add_argument( "-wdp", "--wikidata-dump-path", - type=str, + nargs="?", + const=True, help="Path to a local Wikidata lexemes dump for running with '--all'.", ) @@ -284,8 +290,8 @@ def main() -> None: download_parser = subparsers.add_parser( "download", aliases=["d"], - help="Download Wikidata dumps.", - description="Download Wikidata dumps from dumps.wikimedia.org.", + help="Download Wikidata lexeme dumps.", + description="Download Wikidata lexeme dumps from dumps.wikimedia.org.", epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) @@ -295,15 +301,25 @@ def main() -> None: "--wikidata-dump-version", nargs="?", const="latest", - help="Download Wikidata dump. Optionally specify date in YYYYMMDD format.", + help="Download Wikidata lexeme dump. Optionally specify date in YYYYMMDD format.", ) download_parser.add_argument( - "-od", - "--output-dir", + "-wdp", + "--wikidata-dump-path", type=str, help="The output directory path for the downloaded dump.", ) + # MARK: Interactive + + interactive_parser = subparsers.add_parser( + "interactive", + aliases=["i"], + help="Run in interactive mode.", + description="Run in interactive mode.", + ) + interactive_parser._actions[0].help = "Show this help message and exit." 
+ # MARK: Setup CLI args = parser.parse_args() @@ -347,7 +363,8 @@ def main() -> None: elif args.command in ["get", "g"]: if args.interactive: start_interactive_mode(operation="get") - + if args.translation: parse_wiktionary_translations(args.translation) else: get_data( language=args.language.lower() @@ -400,9 +417,39 @@ def main() -> None: wikidata_dump=args.wikidata_dump_version if args.wikidata_dump_version != "latest" else None, - output_dir=args.output_dir, + output_dir=args.wikidata_dump_path, ) + elif args.command in ["interactive", "i"]: + rprint( + f"[bold cyan]Welcome to {get_version_message()} interactive mode![/bold cyan]" + ) + action = select( + "What would you like to do?", + choices=[ + "Download a Wikidata lexemes dump", + "Check for totals", + "Get data", + "Get translations", + "Exit", + ], + ).ask() + + if action == "Download a Wikidata lexemes dump": + wd_lexeme_dump_download_wrapper() + + elif action == "Check for totals": + start_interactive_mode(operation="total") + + elif action == "Get data": + start_interactive_mode(operation="get") + + elif action == "Get translations": + start_interactive_mode(operation="translations") + + else: + print("Skipping action") + else: parser.print_help() diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 71881ddad..8d86d7fe7 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -35,7 +35,7 @@ language_to_qid, list_all_languages, ) -from scribe_data.wikidata.wikidata_utils import sparql +from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump, sparql def get_qid_by_input(input_str): @@ -44,13 +44,13 @@ def get_qid_by_input(input_str): Parameters ---------- - input_str : str - The input string representing a language or data type. + input_str : str + The input string representing a language or data type. Returns ------- - str or None - The QID corresponding to the input string, or- None if not found. + str or None + The QID corresponding to the input string, or None if not found. """ if input_str: if input_str in language_to_qid: @@ -68,13 +68,13 @@ def get_datatype_list(language): Parameters ---------- - language : str - The language to return data types for. + language : str + The language to return data types for. Returns ------- - data_types : list[str] or None - A list of the corresponding data types. + data_types : list[str] or None + A list of the corresponding data types. """ language_key = language.strip().lower() # normalize input languages = list_all_languages(language_metadata) @@ -128,18 +128,18 @@ def check_qid_is_language(qid: str): """ Parameters ---------- - qid : str - The QID to check Wikidata to see if it's a language and return its English label. + qid : str + The QID to check Wikidata to see if it's a language and return its English label. Outputs ------- - str - The English label of the Wikidata language entity. + str + The English label of the Wikidata language entity. Raises ------ - ValueError - An invalid QID that's not a language has been passed. + ValueError + An invalid QID that's not a language has been passed. """ api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0" request_string = f"{api_endpoint}/entities/items/{qid}" @@ -166,13 +166,13 @@ def print_total_lexemes(language: str = None): Parameters ---------- - language : str (Default=None) - The language to display data type entity counts for. + language : str (Default=None) + The language to display data type entity counts for.
Outputs ------- - str - A formatted string indicating the language, data type, and total number of lexemes for all the languages, if found. + str + A formatted string indicating the language, data type, and total number of lexemes for all the languages, if found. """ if language is None: print("Returning total counts for all languages and data types...\n") @@ -370,7 +370,7 @@ def total_wrapper( language: Union[str, List[str]] = None, data_type: Union[str, List[str]] = None, all_bool: bool = False, - wikidata_dump: str = None, + wikidata_dump: Union[str, bool] = None, ) -> None: """ Conditionally provides the full functionality of the total command. @@ -378,18 +378,38 @@ def total_wrapper( Parameters ---------- - language : Union[str, List[str]] - The language(s) to potentially total data types for. + language : Union[str, List[str]] + The language(s) to potentially total data types for. - data_type : Union[str, List[str]] - The data type(s) to check for. + data_type : Union[str, List[str]] + The data type(s) to check for. - all_bool : boolean - Whether all languages and data types should be listed. + all_bool : boolean + Whether all languages and data types should be listed. - wikidata_dump : str - The local Wikidata dump that can be used to process data. + wikidata_dump : Union[str, bool] + The local Wikidata lexeme dump path that can be used to process data. + If True, indicates the flag was used without a path. """ + # Handle --all flag + if all_bool and wikidata_dump: + language = "all" + + if wikidata_dump is True: # flag without a wikidata lexeme dump path + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + return + + if isinstance(wikidata_dump, str): # if user provided a wikidata lexeme dump path + parse_wd_lexeme_dump( + language=language, + wikidata_dump_type=["total"], + wikidata_dump_path=wikidata_dump, + ) + return if (not language and not data_type) and all_bool: print_total_lexemes() diff --git a/src/scribe_data/resources/data_type_metadata.json b/src/scribe_data/resources/data_type_metadata.json index ff6249f10..4800b0e9a 100644 --- a/src/scribe_data/resources/data_type_metadata.json +++ b/src/scribe_data/resources/data_type_metadata.json @@ -11,5 +11,6 @@ "prepositions": "Q4833830", "pronouns": "Q36224", "proper_nouns": "Q147276", + "translations": "Q21112633", "verbs": "Q24905" } diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index 2661f48d8..1d33b1587 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -44,16 +44,17 @@ def generate_emoji(language, output_dir: str = None): Parameters ---------- - language : str - The ISO code of the language for which to generate emoji keywords. + language : str + The ISO code of the language for which to generate emoji keywords. - output_dir : str, optional - The directory where the generated data will be saved. - If not specified, the data will be saved in a default directory. + output_dir : str, optional + The directory where the generated data will be saved. + If not specified, the data will be saved in a default directory. Returns ------- - None: The function does not return any value but outputs data to the specified directory. + None + The function does not return any value but outputs data to the specified directory. 
""" if check_and_install_pyicu() and check_if_pyicu_installed() is False: print("Thank you.") diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index abdf23634..eb3738620 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -57,15 +57,15 @@ def gen_emoji_lexicon( Parameters ---------- - language : string (default=None) - The language keywords are being generated for. + language : string (default=None) + The language keywords are being generated for. - emojis_per_keyword : int (default=None) - The limit for number of emoji keywords that should be generated per keyword. + emojis_per_keyword : int (default=None) + The limit for number of emoji keywords that should be generated per keyword. Returns ------- - Keywords dictionary for emoji keywords-to-unicode are saved locally or uploaded to Scribe apps. + Keywords dictionary for emoji keywords-to-unicode are saved locally or uploaded to Scribe apps. """ if not icu_installed: raise ImportError("Could not import required PyICU functionality.") diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 933c76231..311478bc2 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -22,6 +22,7 @@ """ import ast +import contextlib import json import os import re @@ -30,6 +31,7 @@ from pathlib import Path from typing import Any, Optional +import questionary from rich import print as rprint # MARK: Utils Variables @@ -84,7 +86,6 @@ for lang, lang_data in language_metadata.items(): if "sub_languages" in lang_data: for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): - sub_lang_lower = sub_lang sub_qid = sub_lang_data.get("qid") if sub_qid is None: @@ -93,8 +94,8 @@ ) else: - language_map[sub_lang_lower] = sub_lang_data - language_to_qid[sub_lang_lower] = sub_qid + language_map[sub_lang] = sub_lang_data + language_to_qid[sub_lang] = sub_qid else: qid = lang_data.get("qid") @@ -112,15 +113,15 @@ def _load_json(package_path: str, file_name: str) -> Any: Parameters ---------- - package_path : str - The fully qualified package that contains the resource. + package_path : str + The fully qualified package that contains the resource. - file_name : str - The name of the file (resource) that contains the JSON data. + file_name : str + The name of the file (resource) that contains the JSON data. Returns ------- - A python entity representing the JSON content. + A python entity representing the JSON content. """ with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" @@ -141,25 +142,26 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - Parameters ---------- - source_value : str - The source value to find equivalents for (e.g., 'english', 'nynorsk'). + source_value : str + The source value to find equivalents for (e.g., 'english', 'nynorsk'). - source_key : str - The source key to reference (e.g., 'language'). + source_key : str + The source key to reference (e.g., 'language'). - target_key : str - The key to target (e.g., 'qid'). + target_key : str + The key to target (e.g., 'qid'). - error_msg : str - The message displayed when a value cannot be found. + error_msg : str + The message displayed when a value cannot be found. Returns ------- - The 'target' value given the passed arguments. + The 'target' value given the passed arguments. Raises ------ - ValueError : when a source_value is not supported or the language only has sub-languages. 
+ ValueError + When a source_value is not supported or the language only has sub-languages. """ # Check if we're searching by language name. if source_key == "language": @@ -193,13 +195,13 @@ def get_language_qid(language: str) -> str: Parameters ---------- - language : str - The language the QID should be returned for. + language : str + The language the QID should be returned for. Returns ------- - str - The Wikidata QID for the language. + str + The Wikidata QID for the language. """ return _find( source_key="language", @@ -215,13 +217,13 @@ def get_language_iso(language: str) -> str: Parameters ---------- - language : str - The language the ISO should be returned for. + language : str + The language the ISO should be returned for. Returns ------- - str - The ISO code for the language. + str + The ISO code for the language. """ return _find( @@ -238,13 +240,13 @@ def get_language_from_iso(iso: str) -> str: Parameters ---------- - iso : str - The ISO the language name should be returned for. + iso : str + The ISO the language name should be returned for. Returns ------- - str - The name for the language which has an ISO value of iso. + str + The name for the language which has an ISO value of iso. """ # Iterate over the languages and their properties. for language, properties in _languages.items(): @@ -270,19 +272,19 @@ def load_queried_data( Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - tuple(Any, str) - A tuple containing the loaded data and the path to the data file. + tuple(Any, str) + A tuple containing the loaded data and the path to the data file. """ data_path = ( Path(dir_path) / language.lower().replace(" ", "_") / f"{data_type}.json" @@ -298,18 +300,18 @@ def remove_queried_data(dir_path: str, language: str, data_type: str) -> None: Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - None : The file is deleted. + None : The file is deleted. """ data_path = ( Path(dir_path) @@ -317,12 +319,9 @@ def remove_queried_data(dir_path: str, language: str, data_type: str) -> None: / f"{data_type}_queried.json" ) - try: + with contextlib.suppress(OSError): os.remove(data_path) - except OSError: - pass - def export_formatted_data( dir_path: str, @@ -336,21 +335,21 @@ def export_formatted_data( Parameters ---------- - dir_path : str - The path to the directory containing the queried data. + dir_path : str + The path to the directory containing the queried data. - formatted_data : dict - The data to be exported. + formatted_data : dict + The data to be exported. - language : str - The language for which the data is being exported. + language : str + The language for which the data is being exported. - data_type : str - The type of data being exported (e.g. 
'nouns', 'verbs'). + data_type : str + The type of data being exported (e.g. 'nouns', 'verbs'). Returns ------- - None + None """ export_path = ( Path(dir_path) @@ -373,13 +372,13 @@ def get_ios_data_path(language: str) -> str: Parameters ---------- - language : str - The language the path should be returned for. + language : str + The language the path should be returned for. Returns ------- - str - The path to the language folder for the given language. + str + The path to the language folder for the given language. """ return Path("Scribe-iOS") / "Keyboards" / "LanguageKeyboards" / f"{language}" @@ -390,13 +389,13 @@ def get_android_data_path() -> str: Parameters ---------- - language : str - The language the path should be returned for. + language : str + The language the path should be returned for. Returns ------- - str - The path to the assets data folder for the application. + str + The path to the assets data folder for the application. """ return Path("Scribe-Android") / "app" / "src" / "main" / "assets" / "data" @@ -409,19 +408,19 @@ def check_command_line_args( Parameters ---------- - file_name : str - The name of the file for clear error outputs if necessary. + file_name : str + The name of the file for clear error outputs if necessary. - passed_values : UNKNOWN (will be checked) - An argument to be checked against known values. + passed_values : UNKNOWN (will be checked) + An argument to be checked against known values. - values_to_check : list(str) - The values that should be checked against. + values_to_check : list(str) + The values that should be checked against. Returns ------- - args: list(str) - The arguments or an error are returned depending on if they're correct. + args: list(str) + The arguments or an error are returned depending on if they're correct. """ try: args = ast.literal_eval(passed_values) @@ -464,19 +463,19 @@ def check_and_return_command_line_args( Parameters ---------- - all_args : list[str] - The arguments passed to the Scribe-Data file. + all_args : list[str] + The arguments passed to the Scribe-Data file. - first_args_check : list[str] - The values that the first argument should be checked against. + first_args_check : list[str] + The values that the first argument should be checked against. - second_args_check : list[str] - The values that the second argument should be checked against. + second_args_check : list[str] + The values that the second argument should be checked against. Returns ------- - first_args, second_args: Tuple[Optional[list[str]], Optional[list[str]]] - The subset of possible first and second arguments that have been verified as being valid. + first_args, second_args: Tuple[Optional[list[str]], Optional[list[str]]] + The subset of possible first and second arguments that have been verified as being valid. """ if len(all_args) == 1: return None, None @@ -521,29 +520,30 @@ def format_sublanguage_name(lang, language_metadata=_languages): Parameters ---------- - lang : str - The name of the language or sub-language to format. + lang : str + The name of the language or sub-language to format. - language_metadata : dict - The metadata containing information about main languages and their sub-languages. + language_metadata : dict + The metadata containing information about main languages and their sub-languages. Returns ------- - str - The formatted language name if it's a sub-language (e.g., 'Nynorsk Norwegian'). - Otherwise the original name. + str + The formatted language name if it's a sub-language (e.g., 'Nynorsk Norwegian'). 
+ Otherwise the original name. Raises ------ - ValueError: If the provided language or sub-language is not found. + ValueError + If the provided language or sub-language is not found. - Example - ------- - > format_sublanguage_name("nynorsk", language_metadata) - 'Nynorsk Norwegian' + Examples + -------- + > format_sublanguage_name("nynorsk", language_metadata) + 'Nynorsk Norwegian' - > format_sublanguage_name("english", language_metadata) - 'English' + > format_sublanguage_name("english", language_metadata) + 'English' """ for main_lang, lang_data in language_metadata.items(): # If it's not a sub-language, return the original name. @@ -596,14 +596,15 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages): # Check if there are sub-languages. if "sub_languages" in lang_data: # Add the sub-languages to current_languages with metadata. - for sub_key, sub_data in lang_data["sub_languages"].items(): - current_languages.append( - { - "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}", - "iso": sub_data.get("iso", ""), - "qid": sub_data.get("qid", ""), - } - ) + current_languages.extend( + { + "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}", + "iso": sub_data.get("iso", ""), + "qid": sub_data.get("qid", ""), + } + for sub_key, sub_data in lang_data["sub_languages"].items() + ) + else: # If no sub-languages, add the main language with metadata. current_languages.append( @@ -636,12 +637,12 @@ def check_lexeme_dump_prompt_download(output_dir: str): Parameters ---------- - output_dir : str - The directory to check for the existence of a Wikidata lexeme dump. + output_dir : str + The directory to check for the existence of a Wikidata lexeme dump. Returns ------- - None : The user is prompted to download a new Wikidata dump after the existence of one is checked. + None : The user is prompted to download a new Wikidata lexeme dump after the existence of one is checked. """ existing_dumps = list(Path(output_dir).glob("*.json.bz2")) if existing_dumps: @@ -649,19 +650,27 @@ def check_lexeme_dump_prompt_download(output_dir: str): for dump in existing_dumps: rprint(f" - {Path(output_dir)}/{dump.name}") - user_input = input( - "\nDo you want to:\n - Delete existing dumps (d)?\n - Skip download (s)?\n - Use existing latest dump (u)?\n - Download new version(n)?\n[d/s/u/n]: " - ).lower() - - if user_input == "d": + user_input = questionary.select( + "Do you want to:", + choices=[ + "Delete existing dumps", + "Skip download", + "Use existing latest dump", + "Download new version", + ], + ).ask() + + if user_input == "Delete existing dumps": for dump in existing_dumps: dump.unlink() rprint("[bold green]Existing dumps deleted.[/bold green]") - user_input = input("Do you want to download latest lexeme dump? (y/N): ") - return user_input != "y" + download_input = questionary.select( + "Do you want to download the latest lexeme dump?", choices=["Yes", "No"] + ).ask() + return download_input != "Yes" - elif user_input == "u": + elif user_input == "Use existing latest dump": # Check for the latest dump file. 
latest_dump = None if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps): @@ -685,7 +694,6 @@ def check_lexeme_dump_prompt_download(output_dir: str): latest_dump = max(dated_dumps, key=lambda x: x[1])[0] if latest_dump: - rprint(f"[bold green]Using latest dump:[/bold green] {latest_dump}") return latest_dump else: @@ -695,3 +703,34 @@ def check_lexeme_dump_prompt_download(output_dir: str): else: rprint("[bold blue]Skipping download.[/bold blue]") return True + + +def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool: + """ + Check if a JSON Wiktionary index file exists and prompt the user for action if it does. + + Parameters + ---------- + index_path : pathlib.Path + The path to check. + + overwrite_all : bool (default=False) + If True, automatically overwrite without prompting. + + Returns + ------- + bool + True if the user chooses to skip (i.e., we do NOT proceed) and False if the file doesn't exist or the user chooses to overwrite (i.e., we DO proceed). + """ + if index_path.exists(): + if overwrite_all: + return False + + print(f"\nIndex file already exists at: {index_path}") + choice = questionary.select( + "Choose an action:", + choices=["Overwrite existing data", "Skip process"], + default="Skip process", + ).ask() + + # If user selects "Skip process", return True meaning "don't proceed". + return choice == "Skip process" + + return False diff --git a/src/scribe_data/wikidata/check_query/check.py b/src/scribe_data/wikidata/check_query/check.py index 41f1706af..955168b5a 100644 --- a/src/scribe_data/wikidata/check_query/check.py +++ b/src/scribe_data/wikidata/check_query/check.py @@ -49,15 +49,15 @@ def ping(url: str, timeout: int) -> bool: Parameters ---------- - url : str - The URL to test. + url : str + The URL to test. - timeout : int - The maximum number of seconds to wait for a reply. + timeout : int + The maximum number of seconds to wait for a reply. Returns ------- - bool : True if connectivity is established or False otherwise. + bool + True if connectivity is established; False otherwise. """ try: with urllib.request.urlopen(url, timeout=timeout) as response: @@ -132,12 +132,12 @@ def check_sparql_file(fpath: str) -> Path: Parameters ---------- - fpath : str - The file to validate. + fpath : str + The file to validate. Returns ------- - Path : the validated file. + Path + The validated file. """ path = Path(fpath) @@ -156,19 +156,20 @@ def check_positive_int(value: str, err_msg: str) -> int: Parameters ---------- - value : str - The value to be validated. + value : str + The value to be validated. - err_msg : str - Used when value fails validation. + err_msg : str + Used when value fails validation. Returns ------- - int : the validated number. + int + The validated number. Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ with contextlib.suppress(ValueError): number = int(value) @@ -184,16 +185,17 @@ def check_limit(limit: str) -> int: Parameters ---------- - limit : str - The LIMIT to be validated. + limit : str + The LIMIT to be validated. Returns ------- - int : the validated LIMIT. + int + The validated LIMIT. Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ return check_positive_int(limit, "LIMIT must be an integer of value 1 or greater.") @@ -204,16 +206,17 @@ def check_timeout(timeout: str) -> int: Parameters ---------- - timeout : str - The timeout to be validated. + timeout : str + The timeout to be validated. Returns ------- - int : the validated timeout. + int + The validated timeout. 
Raises ------ - argparse.ArgumentTypeError + argparse.ArgumentTypeError """ return check_positive_int( timeout, "timeout must be an integer of value 1 or greater." ) @@ -226,12 +229,13 @@ def main(argv=None) -> int: Parameters ---------- - argv (default=None) - If set to None then argparse will use sys.argv as the arguments. + argv (default=None) + If set to None then argparse will use sys.argv as the arguments. Returns - -------- + ------- - int : the exit status - 0 - success; any other value - failure. + int + The exit status: 0 for success; any other value indicates failure. """ cli = argparse.ArgumentParser( description=f"run SPARQL queries from the '{PROJECT_ROOT}' project", @@ -356,7 +360,8 @@ def error_report(failures: list[QueryExecutionException]) -> None: Parameters ---------- - failures (list[QueryExecutionException]) : failed queries. + failures : list[QueryExecutionException] + Failed queries. """ if not failures: return @@ -373,11 +378,11 @@ def success_report(successes: list[tuple[QueryFile, dict]], display: bool) -> No Parameters ---------- - successes : list[tuple[QueryFile, dict]] - Successful queries. + successes : list[tuple[QueryFile, dict]] + Successful queries. - display : bool - Whether there should be an output or not. + display : bool + Whether the report should be printed. """ if not (display and successes): return diff --git a/src/scribe_data/wikidata/check_query/sparql.py b/src/scribe_data/wikidata/check_query/sparql.py index f702907f8..b3c43d27c 100644 --- a/src/scribe_data/wikidata/check_query/sparql.py +++ b/src/scribe_data/wikidata/check_query/sparql.py @@ -39,12 +39,13 @@ def sparql_context(url: str) -> SPARQL.SPARQLWrapper: Parameters ---------- - url : str - A valid URL of a SPARQL endpoint. + url : str + A valid URL of a SPARQL endpoint. Returns ------- - SPARQLWrapper : the context. + SPARQLWrapper + The context. """ context = SPARQL.SPARQLWrapper(url) context.setReturnFormat(SPARQL.JSON) @@ -61,21 +62,22 @@ def execute( Parameters ---------- - query : QueryFile - The SPARQL query to run. + query : QueryFile + The SPARQL query to run. - limit : int - The maximum number of results a query should return. + limit : int + The maximum number of results a query should return. - context : SPARQLWrapper - The SPARQL context. + context : SPARQLWrapper + The SPARQL context. - tries : int - The maximum number of times the query should be executed after failure. + tries : int + The maximum number of times the query should be executed after failure. Returns ------- - dict : the results of the query. + dict + The results of the query. """ def delay_in_seconds() -> int: diff --git a/src/scribe_data/wikidata/format_data.py b/src/scribe_data/wikidata/format_data.py index 68186dbe9..2aa2db970 100644 --- a/src/scribe_data/wikidata/format_data.py +++ b/src/scribe_data/wikidata/format_data.py @@ -46,18 +46,18 @@ def format_data( Parameters ---------- - dir_path : str - The output directory path for results. + dir_path : str + The output directory path for results. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns - _______ + ------- - A saved and formatted data file for the given language and data type. + A saved and formatted data file for the given language and data type. 
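+ + Examples + -------- + A hypothetical invocation (the values are illustrative): + + >>> format_data(dir_path="scribe_data_json_export", language="English", data_type="nouns")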
""" data_list, data_path = load_queried_data( dir_path=dir_path, language=language, data_type=data_type diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index e23be51ee..bbe7c7b53 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -47,18 +47,18 @@ def execute_formatting_script(output_dir: str, language: str, data_type: str): Parameters ---------- - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - language : str - The language for which the data is being loaded. + language : str + The language for which the data is being loaded. - data_type : str - The type of data being loaded (e.g. 'nouns', 'verbs'). + data_type : str + The type of data being loaded (e.g. 'nouns', 'verbs'). Returns ------- - The results of the formatting script saved in the given output directory. + The results of the formatting script saved in the given output directory. """ formatting_file_path = Path(__file__).parent / "format_data.py" @@ -108,21 +108,21 @@ def query_data( Parameters ---------- - language : str - The language(s) to get. + language : str + The language(s) to get. - data_type : str - The data type(s) to get. + data_type : str + The data type(s) to get. - output_dir : str - The output directory path for results. + output_dir : str + The output directory path for results. - overwrite : bool (default: False) - Whether to overwrite existing files. + overwrite : bool (default: False) + Whether to overwrite existing files. Returns ------- - Formatted data from Wikidata saved in the output directory. + Formatted data from Wikidata saved in the output directory. """ current_languages = list_all_languages(language_metadata) current_data_type = ["nouns", "verbs", "prepositions"] diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index d0fbcc6b7..291820708 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -20,39 +20,98 @@ --> """ +from pathlib import Path +from typing import List, Union + +import requests from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper from scribe_data.cli.download import wd_lexeme_dump_download_wrapper +from scribe_data.utils import data_type_metadata, language_metadata +from scribe_data.wiktionary.parse_dump import parse_dump sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) sparql.setMethod(POST) -def parse_wd_lexeme_dump(wikidata_dump: str = None): +def mediaWiki_query(query: str) -> dict: """ - Checks for the existence of a Wikidata dump and parses it if possible. + Query the Wikidata API using a MediaWiki query. Parameters ---------- - wikidata_dump : str - The local Wikidata dump that should be used to get data. + query : str + The MediaWiki query to execute. Returns ------- - The requested data saved locally given file type and location arguments. + dict + The JSON response from the API. + """ + url = ( + f"https://en.wiktionary.org/w/api.php?" 
+ f"action=query&format=json&titles={query}/translations&prop=revisions&rvprop=content" + ) + response = requests.get(url) + return response.json() + + +def parse_wd_lexeme_dump( + language: Union[str, List[str]] = None, + wikidata_dump_type: List[str] = None, + data_types: List[str] = None, + type_output_dir: str = None, + wikidata_dump_path: str = None, +): """ - if wikidata_dump: - wd_lexeme_dump_download_wrapper(None, wikidata_dump) + Checks for the existence of a Wikidata lexeme dump and parses it if possible. + + Parameters + ---------- + language : Union[str, List[str]] + The language(s) to parse the data for. Use "all" for all languages. + + wikidata_dump_type : List[str] + The type(s) of Wikidata lexeme dump to parse (e.g. ["total", "translations", "form"]). - else: - file_path = wd_lexeme_dump_download_wrapper() - if isinstance(file_path, str) and file_path: + data_types : List[str] + The categories to parse when using "form" type (e.g. ["nouns", "adverbs"]). + + type_output_dir : str, optional + The directory to save the parsed JSON data. If None, uses default directory. + + wikidata_dump_path : str, optional + The local Wikidata lexeme dump directory that should be used to get data. + """ + # Convert "all" to list of all languages + if isinstance(language, str) and language.lower() == "all": + language = list(language_metadata.keys()) + if isinstance(data_types, str) and data_types.lower() == "all": + # Exclude translations as it's a separate section + data_types = [ + dt + for dt in data_type_metadata.keys() + if dt != "translations" and dt != "emoji-keywords" + ] + + file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) + + if isinstance(file_path, (str, Path)): + path = Path(file_path) + if path.exists(): rprint( "[bold green]We'll use the following lexeme dump[/bold green]", file_path, ) - rprint( - "[bold red]Parsing Wikidata lexeme dump feature will be available soon...[/bold red]" + parse_dump( + language=language, + parse_type=wikidata_dump_type, + data_types=data_types, + file_path=file_path, + output_dir=type_output_dir, ) + return + + rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]") diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py index 37482beeb..c4b8b4507 100644 --- a/src/scribe_data/wikipedia/extract_wiki.py +++ b/src/scribe_data/wikipedia/extract_wiki.py @@ -47,24 +47,24 @@ def download_wiki(language="en", target_dir="wiki_dump", file_limit=None, dump_i Parameters ---------- - language : str (default=en) - The language of Wikipedia to download. + language : str (default=en) + The language of Wikipedia to download. - target_dir : pathlib.Path (default=wiki_dump) - The directory in the pwd into which files should be downloaded. + target_dir : pathlib.Path (default=wiki_dump) + The directory in the pwd into which files should be downloaded. - file_limit : int (default=None, all files) - The limit for the number of files to download. + file_limit : int (default=None, all files) + The limit for the number of files to download. - dump_id : str (default=None) - The id of an explicit Wikipedia dump that the user wants to download. + dump_id : str (default=None) + The id of an explicit Wikipedia dump that the user wants to download. - Note: a value of None will select the third from the last (latest stable dump). + Note: a value of None will select the third from the last (latest stable dump). 
Returns ------- - file_info : list of lists - Information on the downloaded Wikipedia dump files. + file_info : list of lists + Information on the downloaded Wikipedia dump files. """ if file_limit is not None: assert isinstance( @@ -148,16 +148,16 @@ def _process_article(title, text): Parameters ---------- - title : str - The title of the article. + title : str + The title of the article. - text : str - The text to be processed. + text : str + The text to be processed. Returns ------- - title, text: string, string - The data from the article. + title, text : str, str + The data from the article. """ wikicode = mwparserfromhell.parse(text) @@ -173,24 +173,24 @@ def iterate_and_parse_file(args): Parameters ---------- - args : tuple - The below arguments as a tuple for pool.imap_unordered rather than pool.starmap. + args : tuple + The following arguments as a tuple, for pool.imap_unordered rather than pool.starmap. - input_path : pathlib.Path - The path to the data file. + input_path : pathlib.Path + The path to the data file. - partitions_dir : pathlib.Path - The path to where output file should be stored. + partitions_dir : pathlib.Path + The path to where the output file should be stored. - article_limit : int (default=None) - An optional article_limit of the number of articles to find. + article_limit : int (default=None) + An optional limit on the number of articles to find. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the processes. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the processes. Returns ------- - A parsed file Wikipedia dump file with articles. + A parsed Wikipedia dump file with articles. """ input_path, partitions_dir, article_limit, verbose = args @@ -296,30 +296,30 @@ def parse_to_ndjson( Parameters ---------- - output_path : str (default=articles) - The name of the final output ndjson file. + output_path : str (default=articles) + The name of the final output ndjson file. - input_dir : str (default=wikipedia_dump) - The path to the directory where the data is stored. + input_dir : str (default=wikipedia_dump) + The path to the directory where the data is stored. - partitions_dir : str (default=partitions) - The path to the directory where the output should be stored. + partitions_dir : str (default=partitions) + The path to the directory where the output should be stored. - article_limit : int (default=None) - An optional limit of the number of articles per dump file to find. + article_limit : int (default=None) + An optional limit of the number of articles per dump file to find. - delete_parsed_files : bool (default=False) - Whether to delete the separate parsed files after combining them. + delete_parsed_files : bool (default=False) + Whether to delete the separate parsed files after combining them. - multicore : bool (default=True) - Whether to use multicore processing. + multicore : bool (default=True) + Whether to use multicore processing. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the processes. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the processes. Returns ------- - Wikipedia dump files parsed and converted to json files. + Wikipedia dump files parsed and converted to JSON files. 
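+ + Examples + -------- + A hypothetical invocation (arguments are illustrative): + + >>> parse_to_ndjson(output_path="articles", input_dir="wikipedia_dump", partitions_dir="partitions", article_limit=100)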
""" output_dir = "/".join(list(output_path.split("/")[:-1])) if not output_dir.exists(): diff --git a/src/scribe_data/wikipedia/process_wiki.py b/src/scribe_data/wikipedia/process_wiki.py index bd5bbb162..458e5dda1 100644 --- a/src/scribe_data/wikipedia/process_wiki.py +++ b/src/scribe_data/wikipedia/process_wiki.py @@ -54,25 +54,25 @@ def clean( Parameters ---------- - texts : str or list - The texts to be cleaned and tokenized. + texts : str or list + The texts to be cleaned and tokenized. - language : string (default=en) - The language of the texts being cleaned. + language : string (default=en) + The language of the texts being cleaned. - remove_words : str or list (default=None) - Strings that should be removed from the text body. + remove_words : str or list (default=None) + Strings that should be removed from the text body. - sample_size : float (default=1) - The amount of data to be randomly sampled. + sample_size : float (default=1) + The amount of data to be randomly sampled. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the process. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the process. Returns ------- - cleaned_texts : list - The texts formatted for analysis. + cleaned_texts : list + The texts formatted for analysis. """ if isinstance(texts, str): texts = [texts] @@ -331,27 +331,27 @@ def gen_autosuggestions( Parameters ---------- - text_corpus : list - The Wikipedia texts formatted for word relation extraction. + text_corpus : list + The Wikipedia texts formatted for word relation extraction. - language : string (default=en) - The language autosuggestions are being generated for. + language : string (default=en) + The language autosuggestions are being generated for. - num_words: int (default=500) - The number of words that autosuggestions should be generated for. + num_words: int (default=500) + The number of words that autosuggestions should be generated for. - ignore_words : str or list (default=None) - Strings that should be removed from the text body. + ignore_words : str or list (default=None) + Strings that should be removed from the text body. - update_local_data : bool (default=False) - Saves the created dictionaries as JSONs in the target directories. + update_local_data : bool (default=False) + Saves the created dictionaries as JSONs in the target directories. - verbose : bool (default=True) - Whether to show a tqdm progress bar for the process. + verbose : bool (default=True) + Whether to show a tqdm progress bar for the process. Returns ------- - Autosuggestions dictionaries for common words are saved locally or uploaded to Scribe apps. + Autosuggestions dictionaries for common words are saved locally or uploaded to Scribe apps. """ counter_obj = Counter(chain.from_iterable(text_corpus)) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py new file mode 100644 index 000000000..45f00d192 --- /dev/null +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -0,0 +1,603 @@ +""" +Functions for parsing Wikidata lexeme dumps. + +.. 
raw:: html + +""" + +import bz2 +import time +from collections import Counter, defaultdict +from pathlib import Path +from typing import List, Union + +import orjson +import questionary +from scribe_data.utils import ( + DEFAULT_DUMP_EXPORT_DIR, + check_index_exists, + data_type_metadata, + language_metadata, +) +from tqdm import tqdm + + +class LexemeProcessor: + def __init__( + self, + target_iso: Union[str, List[str]] = None, + parse_type: List[str] = None, + data_types: List[str] = None, + ): + """ + Parameters + ---------- + target_iso : str or list[str], optional + The language(s) to process, matched against names in language_metadata. + + parse_type : list[str], optional + Any combination of 'translations', 'form' and 'total'. + + data_types : list[str], optional + The categories (e.g. ["nouns", "adverbs"]) to process for forms. + """ + # Pre-compute sets for faster lookups. + self.parse_type = set(parse_type or []) + self.data_types = set(data_types or []) + self.target_iso = set( + [target_iso] if isinstance(target_iso, str) else target_iso or [] + ) + + # Pre-compute valid categories by inverting data_type_metadata. + # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...}. + self._category_lookup = {v: k for k, v in data_type_metadata.items()} + self.valid_categories = set(data_type_metadata.values()) + + # Build optimized mapping from ISO codes to full language names. + self.iso_to_name = self._build_iso_mapping() + self.valid_iso_codes = set(self.iso_to_name.keys()) + + # Separate data structures. + self.translations_index = defaultdict( + lambda: defaultdict(lambda: defaultdict(dict)) + ) + self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + + # Stats. + self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} + + # For "total" usage. + self.lexical_category_counts = defaultdict(Counter) + self.translation_counts = defaultdict(Counter) + self.forms_counts = defaultdict(Counter) + + # MARK: build iso mapping + def _build_iso_mapping(self) -> dict: + """ + Build mapping of ISO codes to language names based on language_metadata. + If self.target_iso is set, only include those languages. + """ + iso_mapping = {} + for lang_name, data in language_metadata.items(): + if self.target_iso and lang_name not in self.target_iso: + continue + + if iso_code := data.get("iso"): + iso_mapping[iso_code] = lang_name + + return iso_mapping + + # MARK: process total + def _process_lexeme_total(self, lexeme: dict) -> None: + """ + Gather stats if 'total' is in parse_type: how many entries per language & category, + how many translations, etc. + """ + lexical_category = lexeme.get("lexicalCategory") + if not lexical_category or lexical_category not in data_type_metadata.values(): + return + + category_name = self._category_lookup.get(lexical_category) + if not category_name: + return + + # Update counters. + lemmas = lexeme.get("lemmas", {}) + for lemma in lemmas.values(): + lang = lemma.get("language") + + if lang in self.iso_to_name: + self.lexical_category_counts[lang][category_name] += 1 + translation_count = sum( + len(sense.get("glosses", {})) for sense in lexeme.get("senses", []) + ) + self.translation_counts[lang][category_name] += translation_count + + break + + # MARK: process translations + def _process_lexeme_translations(self, lexeme: dict) -> None: + """ + Process gloss-based translations if 'translations' is in parse_type. + Store them in self.translations_index. 
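+ + The resulting index maps word -> lemma language -> category -> {target ISO: gloss}.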
+ """ + lemmas = lexeme.get("lemmas", {}) + qid = lexeme.get("lexicalCategory") + + if not (lemmas and qid): + return + + category_name = self._category_lookup.get(qid) + if not category_name: + return + + # Only store first valid lemma for translations. + for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.iso_to_name: + continue + + word = lemma_data.get("value", "").lower() + if not word: + continue + + # Build translations from sense glosses. + translations = {} + for sense in lexeme.get("senses", []): + for sense_lang_code, gloss in sense.get("glosses", {}).items(): + if sense_lang_code in self.iso_to_name: + translations[sense_lang_code] = gloss["value"] + + if translations: + self.translations_index[word][lang_code][category_name] = translations + + break # only handle the first lemma + + # MARK: process forms + def _process_lexeme_forms(self, lexeme: dict) -> None: + """ + Process forms for categories in self.data_types if 'form' is in parse_type. + Store them in self.forms_index. + """ + lemmas = lexeme.get("lemmas", {}) + lexical_category = lexeme.get("lexicalCategory") + + # Skip if category missing or not recognized. + if not lexical_category or lexical_category not in data_type_metadata.values(): + return + + # Convert Q1084 -> "nouns", etc. + category_name = self._category_lookup.get(lexical_category) + if not category_name: + return + + # If the category_name is NOT in our data_types list, skip + # e.g., category_name = "nouns", but user didn't request "nouns" in data_types. + if category_name not in self.data_types: + return + + # Process forms. + for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.iso_to_name: + continue + + word = lemma_data.get("value", "").lower() + if not word: + continue + + forms_data = defaultdict(list) + for form in lexeme.get("forms", []): + representations = form.get("representations", {}) + grammatical_features = form.get("grammaticalFeatures", []) + + for rep_lang, rep_data in representations.items(): + if rep_lang == lang_code: + if form_value := rep_data.get("value"): + forms_data[form_value].extend(grammatical_features) + + if forms_data: + self.forms_index[word][lang_code][category_name] = dict(forms_data) + self.forms_counts[lang_code][category_name] += len(forms_data) + + break # only first valid lemma + + # MARK: process lines + def process_lines(self, line: str) -> None: + """ + Process one line of data. Depending on parse_type, we do: + - total stats + - translations + - form categories (filtered by data_types) + """ + try: + lexeme = orjson.loads(line.strip().rstrip(",")) + if not lexeme: + return + + # Get common values once. + lemmas = lexeme.get("lemmas", {}) + lexical_category = lexeme.get("lexicalCategory") + + if not (lemmas and lexical_category in self.valid_categories): + return + + category_name = self._category_lookup.get(lexical_category) + if not category_name: + return + + # Process each type in a single pass through the data. 
+ for lang_code, lemma_data in lemmas.items(): + if lang_code not in self.valid_iso_codes: + continue + + word = lemma_data.get("value", "").lower() + if not word: + continue + + if "total" in self.parse_type: + self.lexical_category_counts[lang_code][category_name] += 1 + translation_count = sum( + len(sense.get("glosses", {})) + for sense in lexeme.get("senses", []) + ) + self.translation_counts[lang_code][category_name] += ( + translation_count + ) + + if "translations" in self.parse_type: + if translations := { + lang: gloss["value"] + for sense in lexeme.get("senses", []) + for lang, gloss in sense.get("glosses", {}).items() + if lang in self.valid_iso_codes + }: + self.translations_index[word][lang_code][category_name] = ( + translations + ) + + if "form" in self.parse_type and category_name in self.data_types: + forms_data = defaultdict(list) + for form in lexeme.get("forms", []): + for rep_lang, rep_data in form.get( + "representations", {} + ).items(): + if rep_lang == lang_code: + if form_value := rep_data.get("value"): + forms_data[form_value].extend( + form.get("grammaticalFeatures", []) + ) + + if forms_data: + self.forms_index[word][lang_code][category_name] = dict( + forms_data + ) + self.forms_counts[lang_code][category_name] += len(forms_data) + + break # only process first valid lemma + + except Exception as e: + print(f"Error processing line: {e}") + + # MARK: process file + def process_file(self, file_path: str, batch_size: int = 50000): + """ + Main loop: read lines from file (bz2) in batches, call process_lines on each. + """ + # Use context manager for better resource handling. + with bz2.open(file_path, "rt", encoding="utf-8") as bzfile: + # Skip header if present. + first_line = bzfile.readline() + if not first_line.strip().startswith("["): + bzfile.seek(0) + + # Process in larger batches for better performance. + batch = [] + start_time = time.time() + total_entries = int(Path(file_path).stat().st_size / 263) + + for line in tqdm(bzfile, total=total_entries, desc="Processing entries"): + if line.strip() not in ["[", "]", ",", ""]: + batch.append(line) + + if len(batch) >= batch_size: + self._process_batch(batch) + batch.clear() # more efficient than creating new list + self.stats["processed_entries"] += 1 + + # Process remaining items. + if batch: + self._process_batch(batch) + + # Update stats. + self.stats["processing_time"] = time.time() - start_time + self.stats["unique_words"] = len(self.forms_index) + len( + self.translations_index + ) + + # Print summary if "total" was requested. + if "total" in self.parse_type: + self._print_total_summary() + + def _process_batch(self, batch: list) -> None: + """ + Process a batch of lines. + """ + for line in batch: + self.process_lines(line) + + # MARK: print total summary + def _print_total_summary(self): + """ + Print stats if parse_type == total. 
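+ + The table lists each language's lexical categories with their lexeme and translation counts.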
+ """ + print( + f"{'Language':<20} {'Data Type':<25} {'Total Lexemes':<25} {'Total Translations':<20}" + ) + print("=" * 90) + for lang, counts in self.lexical_category_counts.items(): + lang_name = self.iso_to_name[lang] + first_row = True + + for category, count in counts.most_common(): + trans_count = self.translation_counts[lang][category] + + if first_row: + print( + f"{lang_name:<20} {category:<25} {count:<25,} {trans_count:<20,}" + ) + first_row = False + + else: + print(f"{'':<20} {category:<25} {count:<25,} {trans_count:<20,}") + + if lang != list(self.lexical_category_counts.keys())[-1]: + print("\n" + "=" * 90 + "\n") + + # MARK: export translations + def export_translations_json(self, filepath: str, language_iso: str = None) -> None: + """ + Save translations_index to file, optionally filtering by language_iso. + """ + if language_iso: + if language_iso not in self.iso_to_name: + print( + f"Warning: ISO {language_iso} unknown, skipping translations export..." + ) + return + + filtered = { + word: {language_iso: lang_data[language_iso]} + for word, lang_data in self.translations_index.items() + if language_iso in lang_data + } + self._save_by_language(filtered, filepath, language_iso, "translations") + + # MARK: export forms + def export_forms_json( + self, filepath: str, language_iso: str = None, data_type: str = None + ) -> None: + """ + Save forms_index to file, optionally filtering by: + - language_iso + - data_type (e.g. "nouns", "adverbs") + + If data_type is given, we only export that one category from forms. + """ + if language_iso: + if language_iso not in self.iso_to_name: + print(f"Warning: ISO {language_iso} unknown, skipping forms export...") + return + + filtered = {} + for word, lang_data in self.forms_index.items(): + if language_iso in lang_data: + # If data_type is given, only keep that category. + if data_type: + if data_type in lang_data[language_iso]: + filtered[word] = { + language_iso: { + data_type: lang_data[language_iso][data_type] + } + } + + else: + filtered[word] = {language_iso: lang_data[language_iso]} + + self._save_by_language( + filtered, filepath, language_iso, data_type or "forms" + ) + + def _save_by_language(self, data, filepath, language_iso, category_type): + """ + Save data to exports//filename. + """ + base_path = Path(filepath) + lang_name = self.iso_to_name[language_iso] + + lang_filepath = base_path.parent / lang_name / base_path.name + lang_filepath.parent.mkdir(parents=True, exist_ok=True) + + print(f"Saving {lang_name} {category_type} index to {lang_filepath}...") + with open(lang_filepath, "wb") as f: + f.write( + orjson.dumps( + self._to_dict(data), + option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS, + ) + ) + + def _to_dict(self, dd): + """ + Recursively convert defaultdict to dict. + """ + if isinstance(dd, defaultdict): + dd = {k: self._to_dict(v) for k, v in dd.items()} + + return dd + + +# MARK: parse dump +def parse_dump( + language: Union[str, List[str]] = None, + parse_type: List[str] = None, + data_types: List[str] = None, + file_path: str = "latest-lexemes.json.bz2", + output_dir: str = None, + overwrite_all: bool = False, +): + """ + Parse a Wikidata lexeme dump file and extract linguistic data. + + Parameters + ---------- + language : str or list of str, optional + Language(s) to parse data for. Must match language names in language_metadata. + + parse_type : list of str, optional + Types of parsing to perform. 
Valid options are: + - 'translations': Extract word translations + - 'form': Extract grammatical forms + - 'total': Gather statistical totals + + data_types : list of str, optional + Categories to parse when using 'form' type (e.g. ["nouns", "adverbs"]). + Only used if 'form' is in parse_type. + + file_path : str, default="latest-lexemes.json.bz2" + Path to the lexeme dump file. + + output_dir : str, optional + Directory to save output files. If None, uses DEFAULT_DUMP_EXPORT_DIR. + + overwrite_all : bool, default=False + If True, automatically overwrite existing files without prompting. + + Notes + ----- + The function processes a Wikidata lexeme dump and extracts linguistic data based on + the specified parameters. For each language and data type combination, it creates + separate JSON files in the output directory structure (e.g. <output_dir>/<language>/lexeme_<data_type>.json). + + If a requested index file already exists, that language/category combination + will be skipped. + """ + # Prepare the environment: use the default if output_dir is None. + output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Convert single strings to lists. + languages = [language] if isinstance(language, str) else language + parse_type = parse_type or [] + data_types = data_types or [] + + print(f"Languages: {languages}") + print(f"parse_type: {parse_type}") + if data_types: + print(f"data_types for forms: {data_types}") + + if "total" not in parse_type: + choice = questionary.select( + "Choose an action:", + choices=["Overwrite existing data", "Skip process"], + default="Skip process", + ).ask() + if choice == "Overwrite existing data": + overwrite_all = True + + # For translations, we only need to check the translations index. + if "translations" in parse_type: + languages_to_process = [] + for lang in languages: + index_path = Path(output_dir) / lang / "lexeme_translations.json" + + if not check_index_exists(index_path, overwrite_all): + languages_to_process.append(lang) + + else: + print(f"Skipping {lang}/lexeme_translations.json - already exists") + + # Update languages list but keep data_types as is. + languages = languages_to_process + + # For forms, check each language/data_type combination. + elif "form" in parse_type: + languages_to_process = [] + data_types_to_process = set() + + for lang in languages: + needs_processing = False + for data_type in data_types: + index_path = Path(output_dir) / lang / f"lexeme_{data_type}.json" + + if not check_index_exists(index_path, overwrite_all): + needs_processing = True + data_types_to_process.add(data_type) + + else: + print(f"Skipping {lang}/lexeme_{data_type}.json - already exists") + + if needs_processing: + languages_to_process.append(lang) + + # Update both lists. + languages = languages_to_process + data_types = list(data_types_to_process) + + print(f"Languages to process: {languages}") + if data_types: + print(f"Data types to process: {data_types}") + + if not languages: + print("All requested data already exists. Nothing to process.") + return + + processor = LexemeProcessor( + target_iso=languages, parse_type=parse_type, data_types=data_types + ) + processor.process_file(file_path) + + # MARK: Handle JSON exports + + # (a) If "translations" in parse_type -> export them. + if "translations" in parse_type: + index_path = Path(output_dir) / "lexeme_translations.json" + + # Export translations for each ISO found. 
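+ # The loop below collects every ISO code present in the index and then + # writes one translations file per language via export_translations_json.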
+ iso_codes = set() + for word_data in processor.translations_index.values(): + iso_codes.update(word_data.keys()) + for iso_code in iso_codes: + if iso_code in processor.iso_to_name: + processor.export_translations_json(str(index_path), iso_code) + + # (b) If "form" in parse_type -> export forms for each data_type in data_types. + if "form" in parse_type: + # For each data_type, we create a separate file, e.g. lexeme_nouns.json. + for dt in data_types: + index_path = Path(output_dir) / f"lexeme_{dt}.json" + print(f"Exporting forms for {dt} to {index_path}...") + + iso_codes = set() + for word_data in processor.forms_index.values(): + iso_codes.update(word_data.keys()) + + for iso_code in iso_codes: + if iso_code in processor.iso_to_name: + processor.export_forms_json( + filepath=str(index_path), language_iso=iso_code, data_type=dt + ) diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py new file mode 100644 index 000000000..6968c8adc --- /dev/null +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -0,0 +1,136 @@ +""" +Functions to parse the translations of a word from the MediaWiki API. + +.. raw:: html + +""" + +import json +import re + +from scribe_data.utils import get_language_from_iso +from scribe_data.wikidata.wikidata_utils import mediaWiki_query + + +def fetch_translation_page(word): + """ + Fetch the wikitext of a word's translations sub-page from Wiktionary. + """ + data = mediaWiki_query(word) + + pages = data.get("query", {}).get("pages", {}) + # Extract page object from dictionary. + page = next(iter(pages.values())) if pages else {} + + # Get the wikitext from the 'revisions' key. + return page.get("revisions", [{}])[0].get("*", "") + + +def parse_wikitext_for_translations(wikitext): + """ + Parse the wikitext line by line to extract translations, + language codes, part of speech, and context. + """ + translations_by_lang = {} + current_part_of_speech = None  # Track whether we are in a Noun or Verb section. + current_context = None  # Track the current trans-top context. + + # Split the wikitext into individual lines. + for line in wikitext.splitlines(): + # Detect part of speech/data-types: Noun or Verb. + if line.startswith("===Noun==="): + current_part_of_speech = "Noun" + + elif line.startswith("===Verb==="): + current_part_of_speech = "Verb" + + if trans_top_match := re.match(r"\{\{trans-top\|(.+?)\}\}", line): + current_context = trans_top_match[1].strip() + + if template_match := re.match( + r"^\*\s([A-Za-z\s]+):\s\{\{t\+?\|([a-zA-Z\-]+)\|([^|]+)\}\}", + line.strip(), + ): + lang_code = template_match[2].strip() + translation_text = template_match[3].strip() + + # Ensure there's a list to hold translations for this language. + if lang_code not in translations_by_lang: + translations_by_lang[lang_code] = [] + + translations_by_lang[lang_code].append( + { + "translation": translation_text, + "part_of_speech": current_part_of_speech, + "context": current_context, + } + ) + + return translations_by_lang + + +def build_json_format(word, translations_by_lang): + """ + Build the final JSON format for the translations of a word. + """ + book_translations = {word: {}} + # Keep counters to number the translations for each (lang, part_of_speech). + language_counters = {} + + for lang_code, entries in translations_by_lang.items(): + try: + lang_name = get_language_from_iso(lang_code) + except ValueError: + # Skip this language if it's not supported. + continue + + # Make sure this language is in the dictionary. 
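+ # The counters below assign 1-based string indices per (language, part of + # speech) pair, so each translation entry gets a stable numbered key.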
+ if lang_name not in book_translations[word]: + book_translations[word][lang_name] = {} + + for item in entries: + pos = item["part_of_speech"] or "Unknown" + desc = item["context"] + trans = item["translation"] + + if pos not in book_translations[word][lang_name]: + book_translations[word][lang_name][pos] = {} + language_counters[(lang_code, pos)] = 1 + + idx = str(language_counters[(lang_code, pos)]) + + # Insert the item at the next available index. + book_translations[word][lang_name][pos][idx] = { + "description": desc, + "translations": trans, + } + language_counters[(lang_code, pos)] += 1 + + return book_translations + + +def parse_wiktionary_translations(word): + """ + Parse the translations of a word from Wiktionary. + """ + wikitext = fetch_translation_page(word) + translations_by_lang = parse_wikitext_for_translations(wikitext) + + if not translations_by_lang: + print("No translations found") + return + + final_json = build_json_format(word, translations_by_lang) + print(json.dumps(final_json, indent=4, ensure_ascii=False)) diff --git a/tests/cli/test_download.py b/tests/cli/test_download.py index 5dfa5830b..d4987b22c 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/test_download.py @@ -101,53 +101,77 @@ def test_download_wd_lexeme_dump_by_date(self, mock_findall, mock_get): ) @patch("scribe_data.cli.download.requests.get") - @patch("scribe_data.cli.download.input", return_value="y") @patch( - "scribe_data.cli.download.check_lexeme_dump_prompt_download", return_value=None + "scribe_data.cli.download.check_lexeme_dump_prompt_download", return_value=False ) @patch("scribe_data.cli.download.open", new_callable=mock_open) @patch("scribe_data.cli.download.tqdm") - @patch("scribe_data.cli.download.DEFAULT_DUMP_EXPORT_DIR", new="test_export_dir") + @patch("scribe_data.cli.download.os.makedirs") + @patch("scribe_data.cli.download.questionary.confirm") def test_wd_lexeme_dump_download_wrapper_latest( - self, mock_tqdm, mock_file, mock_check_prompt, mock_input, mock_get + self, + mock_confirm, + mock_makedirs, + mock_tqdm, + mock_file, + mock_check_prompt, + mock_get, ): """ Test wrapper function for downloading latest Wikidata lexeme dump. """ + mock_confirm.return_value.ask.return_value = True + mock_get.return_value.text = 'href="latest-all.json.bz2"' mock_get.return_value.raise_for_status = MagicMock() mock_get.return_value.headers = {"content-length": "100"} mock_get.return_value.iter_content = lambda chunk_size: [b"data"] * 10 - with patch("scribe_data.cli.download.os.makedirs") as mock_makedirs: + # Mock DEFAULT_DUMP_EXPORT_DIR. + with patch( + "scribe_data.cli.download.DEFAULT_DUMP_EXPORT_DIR", new="test_export_dir" + ): download_path = wd_lexeme_dump_download_wrapper() + self.assertIsNotNone(download_path, "Download path should not be None") self.assertIn("latest-lexemes.json.bz2", download_path) mock_makedirs.assert_called_with("test_export_dir", exist_ok=True) + mock_confirm.assert_called_once() - def test_check_lexeme_dump_prompt_download_existing(self): + @patch("scribe_data.utils.questionary.select") + @patch( + "scribe_data.utils.Path.glob", + return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], + ) + def test_check_lexeme_dump_prompt_download_existing(self, mock_glob, mock_select): """ Test prompt for using existing lexeme dump files. 
""" - with patch( - "scribe_data.utils.Path.glob", - return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], - ): - with patch("builtins.input", return_value="u"): - result = check_lexeme_dump_prompt_download( - "scribe_data/tests/cli/test_export_dir" - ) - self.assertEqual(result.name, "latest-lexemes.json.bz2") + # Mock the select dialog to return "Use existing latest dump". + mock_select.return_value.ask.return_value = "Use existing latest dump" + + result = check_lexeme_dump_prompt_download( + "scribe_data/tests/cli/test_export_dir" + ) + self.assertEqual(result.name, "latest-lexemes.json.bz2") - def test_check_lexeme_dump_prompt_download_delete(self): + @patch("scribe_data.utils.questionary.select") + @patch( + "scribe_data.utils.Path.glob", + return_value=[Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")], + ) + def test_check_lexeme_dump_prompt_download_delete(self, mock_glob, mock_select): """ Test prompt for deleting existing lexeme dump files. """ - mock_existing_files = [Path("dump1.json.bz2"), Path("latest-lexemes.json.bz2")] - with patch("scribe_data.utils.Path.glob", return_value=mock_existing_files): - with patch("builtins.input", side_effect=["d", "n"]): - with patch("scribe_data.utils.Path.unlink") as mock_unlink: - result = check_lexeme_dump_prompt_download( - "scribe_data/tests/cli/test_export_dir" - ) - self.assertTrue(mock_unlink.called) - self.assertTrue(result) + # Configure the mock to return "Delete existing dumps" first and then "No". + mock_select.side_effect = [ + MagicMock(ask=MagicMock(return_value="Delete existing dumps")), + MagicMock(ask=MagicMock(return_value="No")), + ] + + with patch("scribe_data.utils.Path.unlink") as mock_unlink: + result = check_lexeme_dump_prompt_download( + "scribe_data/tests/cli/test_export_dir" + ) + self.assertTrue(mock_unlink.called) + self.assertTrue(result) diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 8cf750904..914fbe9e6 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -62,37 +62,37 @@ def test_invalid_arguments(self): # MARK: All Data - @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_data_types_for_language(self, mock_query_data): - """ - Test retrieving all data types for a specific language. - - Ensures that `query_data` is called properly when `--all` flag is used with a language. - """ - get_data(all_bool=True, language="English") - mock_query_data.assert_called_once_with( - languages=["English"], - data_type=None, - output_dir="scribe_data_json_export", - overwrite=False, - ) - - @patch("scribe_data.cli.get.query_data") - @patch("builtins.input", lambda _: "N") # don't use dump - def test_get_all_languages_for_data_type(self, mock_query_data): - """ - Test retrieving all languages for a specific data type. - - Ensures that `query_data` is called properly when `--all` flag is used with a data type. - """ - get_data(all_bool=True, data_type="nouns") - mock_query_data.assert_called_once_with( - languages=None, - data_type=["nouns"], - output_dir="scribe_data_json_export", - overwrite=False, - ) + # @patch("scribe_data.cli.get.query_data") + # @patch("scribe_data.cli.get.prompt_user_download_all", return_value=False) + # def test_get_all_data_types_for_language(self, mock_prompt, mock_query_data): + # """ + # Test retrieving all data types for a specific language. + + # Ensures that `query_data` is called properly when `--all` flag is used with a language. 
+ # """ + # get_data(all_bool=True, language="English") + # mock_query_data.assert_called_once_with( + # languages=["English"], + # data_type=None, + # output_dir="scribe_data_json_export", + # overwrite=False, + # ) + + # @patch("scribe_data.cli.get.query_data") + # @patch("scribe_data.cli.get.prompt_user_download_all", return_value=False) + # def test_get_all_languages_for_data_type(self, mock_prompt, mock_query_data): + # """ + # Test retrieving all languages for a specific data type. + + # Ensures that `query_data` is called properly when `--all` flag is used with a data type. + # """ + # get_data(all_bool=True, data_type="nouns") + # mock_query_data.assert_called_once_with( + # languages=None, + # data_type=["nouns"], + # output_dir="scribe_data_json_export", + # overwrite=False, + # ) # MARK: Language and Data Type @@ -115,7 +115,8 @@ def test_get_specific_language_and_data_type(self, mock_query_data): # MARK: Capitalized Language @patch("scribe_data.cli.get.query_data") - def test_get_data_with_capitalized_language(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_capitalized_language(self, mock_glob, mock_query_data): """ Test retrieving data with a capitalized language. @@ -133,7 +134,8 @@ def test_get_data_with_capitalized_language(self, mock_query_data): # MARK: Lowercase Language @patch("scribe_data.cli.get.query_data") - def test_get_data_with_lowercase_language(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_lowercase_language(self, mock_glob, mock_query_data): """ Test retrieving data with a lowercase language. @@ -171,7 +173,8 @@ def test_get_data_with_different_output_directory(self, mock_query_data): # MARK: Overwrite is True @patch("scribe_data.cli.get.query_data") - def test_get_data_with_overwrite_true(self, mock_query_data): + @patch("scribe_data.cli.get.Path.glob", return_value=[]) + def test_get_data_with_overwrite_true(self, mock_glob, mock_query_data): """ Test retrieving data with the overwrite flag set to True. diff --git a/tests/wikidata/test_check_query.py b/tests/wikidata/test_check_query.py index e50b3955e..19b3097b1 100755 --- a/tests/wikidata/test_check_query.py +++ b/tests/wikidata/test_check_query.py @@ -25,6 +25,7 @@ from pathlib import Path from unittest.mock import MagicMock, mock_open, patch from urllib.error import HTTPError + import pytest from scribe_data.wikidata.check_query.check import ( all_queries,