Skip to content

Commit

Permalink
Merge pull request #536 from axif0/all_in_one
Browse files Browse the repository at this point in the history
Feat: Translation cmd for scribe-data
  • Loading branch information
andrewtavis authored Jan 4, 2025
2 parents b27af41 + f28a176 commit 4132f57
Show file tree
Hide file tree
Showing 31 changed files with 1,679 additions and 640 deletions.
13 changes: 9 additions & 4 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -300,13 +300,18 @@ def example_function(argument: argument_type) -> return_type:
Parameters
----------
argument: argument_type
Description of your argument.
argument : argument_type
Description of your argument.
Returns
-------
return_value : return_type
Description of your return value.
return_value : return_type
Description of your return value.
Raises
------
ErrorType
Description of the error and the condition that raises it.
"""

...
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ ruff>=0.3.3
SPARQLWrapper>=2.0.0
sphinx-rtd-theme>=3.0.0
tqdm==4.66.4
orjson>=3.10.12
26 changes: 13 additions & 13 deletions src/scribe_data/check/check_project_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,16 @@ def get_missing_languages(
Parameters
----------
reference_languages : dict
A dictionary of languages from the reference source.
reference_languages : dict
A dictionary of languages from the reference source.
target_languages : dict
A dictionary of languages from the target source to check for missing entries.
target_languages : dict
A dictionary of languages from the target source to check for missing entries.
Returns
-------
list[str]
A list of languages and sub-languages that are in target_languages but not in reference_languages.
list[str]
A list of languages and sub-languages that are in target_languages but not in reference_languages.
"""
missing_languages = []
reference_keys = reference_languages.keys()
Expand Down Expand Up @@ -130,17 +130,17 @@ def validate_language_properties(languages_dict: dict) -> dict:
Parameters
----------
languages_dict : dict
A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key.
languages_dict : dict
A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key.
Returns
-------
dict: A dictionary with two lists:
- "missing_qids": Languages or sub-languages missing the 'qid' property.
- "missing_isos": Languages or sub-languages missing the 'iso' property.
dict: A dictionary with two lists:
- "missing_qids": Languages or sub-languages missing the 'qid' property.
- "missing_isos": Languages or sub-languages missing the 'iso' property.
Each entry in these lists is in the format "parent_language - sub_language" for sub-languages,
or simply "parent_language" for the parent languages.
Each entry in these lists is in the format "parent_language - sub_language" for sub-languages,
or simply "parent_language" for the parent languages.
"""
missing_qids = []
missing_isos = []
Expand Down
22 changes: 11 additions & 11 deletions src/scribe_data/check/check_project_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,24 +48,24 @@ def check_for_sparql_files(folder_path, data_type, language, subdir, missing_que
Parameters
----------
folder_path : str
The path to the data-type folder.
folder_path : str
The path to the data-type folder.
data_type : str
The name of the data type being checked.
data_type : str
The name of the data type being checked.
language : str
The name of the language being processed.
language : str
The name of the language being processed.
subdir : str or None
The name of the sub-directory (for languages with sub-dialects), or None.
subdir : str or None
The name of the sub-directory (for languages with sub-dialects), or None.
missing_queries : list
A list to which missing SPARQL query files will be appended.
missing_queries : list
A list to which missing SPARQL query files will be appended.
Returns
-------
bool: True if at least one .sparql file is found, False otherwise.
bool: True if at least one .sparql file is found, False otherwise.
"""
sparql_files = [f for f in os.listdir(folder_path) if f.endswith(".sparql")]

Expand Down
29 changes: 14 additions & 15 deletions src/scribe_data/check/check_pyicu.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
from pathlib import Path

import pkg_resources
import questionary
import requests
from questionary import confirm


def check_if_pyicu_installed():
Expand Down Expand Up @@ -90,15 +90,15 @@ def download_wheel_file(wheel_url, output_dir):
Parameters
----------
wheel_url : str
The URL of the wheel file to download.
wheel_url : str
The URL of the wheel file to download.
output_dir : str
The directory to save the downloaded file.
output_dir : str
The directory to save the downloaded file.
Returns
-------
str : path to the downloaded wheel file.
str : path to the downloaded wheel file.
"""
response = requests.get(wheel_url)
response.raise_for_status() # raise an error for bad responses
Expand All @@ -118,18 +118,18 @@ def find_matching_wheel(wheels, python_version, architecture):
Parameters
----------
wheels : list
The list of available wheels.
wheels : list
The list of available wheels.
python_version : str
The Python version (e.g., 'cp311').
python_version : str
The Python version (e.g., 'cp311').
architecture : str
The architecture type (e.g., 'win_amd64').
architecture : str
The architecture type (e.g., 'win_amd64').
Returns
-------
str : The download URL of the matching wheel or None if not found.
str : The download URL of the matching wheel or None if not found.
"""
return next(
(
Expand All @@ -148,8 +148,7 @@ def check_and_install_pyicu():
# Fetch available wheels from GitHub to estimate download size.
wheels, total_size_mb = fetch_wheel_releases()

# Use questionary to ask for user confirmation
user_wants_to_proceed = confirm(
user_wants_to_proceed = questionary.confirm(
f"{package_name} is not installed.\nScribe-Data can install the package and the needed dependencies."
f"\nApproximately {total_size_mb:.2f} MB will be downloaded.\nDo you want to proceed?"
).ask()
Expand Down
98 changes: 49 additions & 49 deletions src/scribe_data/check/check_query_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,18 @@ def extract_forms_from_sparql(file_path: Path) -> str:
Parameters
----------
file_path : Path
The path to the SPARQL query file from which to extract forms.
file_path : Path
The path to the SPARQL query file from which to extract forms.
Returns
-------
query_form_dict : dict
The file path with form labels of the query and their respective QIDs.
query_form_dict : dict
The file path with form labels of the query and their respective QIDs.
Raises
------
FileNotFoundError
If the specified file does not exist.
FileNotFoundError
If the specified file does not exist.
"""
optional_pattern = r"\s\sOPTIONAL\s*\{([^}]*)\}"
try:
Expand All @@ -95,13 +95,13 @@ def extract_form_rep_label(form_text: str):
Parameters
----------
form_text : str
The text that defines the form within the query.
form_text : str
The text that defines the form within the query.
Returns
-------
str
The label of the form representation.
str
The label of the form representation.
"""
onto_rep_pattern = r"ontolex:representation .* ;"
if line_match := re.search(pattern=onto_rep_pattern, string=form_text):
Expand All @@ -119,13 +119,13 @@ def decompose_label_features(label):
Parameters
----------
label : str
The concatenated label string composed of several grammatical features.
label : str
The concatenated label string composed of several grammatical features.
Returns
-------
list
A list of grammatical features extracted from the label in their original order.
list
A list of grammatical features extracted from the label in their original order.
"""
components = re.findall(r"[A-Za-z][^A-Z]*", label)
valid_components = []
Expand Down Expand Up @@ -157,13 +157,13 @@ def extract_form_qids(form_text: str):
Parameters
----------
form_text : str
The text that defines the form within the query.
form_text : str
The text that defines the form within the query.
Returns
-------
list[str]
All QIDs that make up the form.
list[str]
All QIDs that make up the form.
"""
qids_pattern = r"wikibase:grammaticalFeature .+ \."
if match := re.search(pattern=qids_pattern, string=form_text):
Expand All @@ -179,13 +179,13 @@ def check_form_label(form_text: str):
Parameters
----------
form_text : str
The text that defines the form within the query.
form_text : str
The text that defines the form within the query.
Returns
-------
bool
Whether the form and its current representation label match (repForm and rep).
bool
Whether the form and its current representation label match (repForm and rep).
"""
form_label_line_pattern = r"\?lexeme ontolex:lexicalForm .* \."

Expand Down Expand Up @@ -221,13 +221,13 @@ def check_query_formatting(form_text: str):
Parameters
----------
form_text : str
The SPARQL query text to check.
form_text : str
The SPARQL query text to check.
Returns
-------
bool
Whether there are formatting errors with the query.
bool
Whether there are formatting errors with the query.
"""
# Check for spaces before commas that should not exist.
if re.search(r"\s,", form_text):
Expand All @@ -249,13 +249,13 @@ def return_correct_form_label(qids: list):
Parameters
----------
qids : list[str]
All QIDs that make up the form.
qids : list[str]
All QIDs that make up the form.
Returns
-------
correct_label : str
The label for the representation given the QIDs.
correct_label : str
The label for the representation given the QIDs.
"""
if not qids:
return "Invalid query formatting found"
Expand Down Expand Up @@ -289,14 +289,14 @@ def validate_forms(query_text: str) -> str:
Parameters
----------
query_text : str
The SPARQL query text as a string.
query_text : str
The SPARQL query text as a string.
Returns
-------
str
Error message if there are any issues with the order of variables or forms,
otherwise an empty string.
str
Error message if there are any issues with the order of variables or forms,
otherwise an empty string.
"""
select_pattern = r"SELECT\s+(.*?)\s+WHERE"

Expand Down Expand Up @@ -376,13 +376,13 @@ def check_docstring(query_text: str) -> bool:
Parameters
----------
query_text : str
The SPARQL query's text to be checked.
query_text : str
The SPARQL query's text to be checked.
Returns
-------
bool
True if the docstring is correctly formatted.
bool
True if the docstring is correctly formatted.
"""
# Split the text into lines.
query_lines = query_text.splitlines(keepends=True)
Expand Down Expand Up @@ -418,14 +418,14 @@ def check_forms_order(query_text):
Parameters
----------
query_text : str
The SPARQL query text containing the SELECT statement with variables.
query_text : str
The SPARQL query text containing the SELECT statement with variables.
Returns
-------
list or bool
A sorted list of variables if the ordering differs from the original,
otherwise a boolean indicating that the order matches.
list or bool
A sorted list of variables if the ordering differs from the original,
otherwise a boolean indicating that the order matches.
"""
select_pattern = r"SELECT\s+(.*?)\s+WHERE"

Expand Down Expand Up @@ -496,14 +496,14 @@ def check_optional_qid_order(query_file: str) -> str:
Parameters
----------
query_file : str
The path to the SPARQL query file to be checked.
query_file : str
The path to the SPARQL query file to be checked.
Returns
-------
str
A formatted string with details on any order mismatches in the QIDs, or an empty
string if all QIDs are correctly ordered.
str
A formatted string with details on any order mismatches in the QIDs, or an empty
string if all QIDs are correctly ordered.
"""
forms = extract_forms_from_sparql(query_file)
error_messages = []
Expand Down
Loading

0 comments on commit 4132f57

Please sign in to comment.