Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Translation cmd for scribe-data #536

Merged
merged 16 commits into from
Jan 4, 2025
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ ruff>=0.3.3
SPARQLWrapper>=2.0.0
sphinx-rtd-theme>=3.0.0
tqdm==4.66.4
orjson>=3.10.12
15 changes: 6 additions & 9 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import requests
from rich import print as rprint
from tqdm import tqdm
import questionary

from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download

Expand Down Expand Up @@ -244,16 +245,12 @@ def wd_lexeme_dump_download_wrapper(
filename = dump_url.split("/")[-1]
output_path = str(Path(output_dir) / filename)

user_response = (
input(
"We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
"\nDo you want to proceed? (y/n): "
)
.strip()
.lower()
)
user_response = questionary.confirm(
"We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. Do you want to proceed?",
default=True,
).ask()

if user_response == "y":
if user_response:
rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

response = requests.get(dump_url, stream=True)
Expand Down
58 changes: 49 additions & 9 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from typing import List, Union

from rich import print as rprint
import questionary

from scribe_data.cli.convert import convert_wrapper
from scribe_data.unicode.generate_emoji_keywords import generate_emoji
Expand Down Expand Up @@ -108,16 +109,20 @@ def prompt_user_download_all():
"""
Checks with the user if they'd rather use Wikidata lexeme dumps before a download all call.
"""
download_all_input = input(
"Do you want to query Wikidata, or would you rather use Wikidata lexeme dumps? (y/N): "
)
return download_all_input == "y"
return questionary.confirm(
"Do you want to query Wikidata directly? (selecting 'no' will use Wikidata lexeme dumps)",
default=False,
).ask()

if all_bool:
if language:
if prompt_user_download_all():
parse_wd_lexeme_dump()

parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
data_types=data_types,
type_output_dir=output_dir,
)
else:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language: {language.title()}")
Expand All @@ -133,8 +138,12 @@ def prompt_user_download_all():

elif data_type:
if prompt_user_download_all():
parse_wd_lexeme_dump()

parse_wd_lexeme_dump(
language=None,
wikidata_dump_type=["form"],
data_types=[data_type],
type_output_dir=output_dir,
)
else:
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
Expand All @@ -152,13 +161,44 @@ def prompt_user_download_all():
rprint(
"[bold red]Note that the download all functionality must use Wikidata dumps to observe responsible Wikidata Query Service usage practices.[/bold red]"
)
parse_wd_lexeme_dump()
parse_wd_lexeme_dump(
language="all",
wikidata_dump_type=["form", "translations"],
data_types="all",
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
)

# MARK: Emojis

elif data_type in {"emoji-keywords", "emoji_keywords"}:
generate_emoji(language=language, output_dir=output_dir)

# MARK: Translations

elif data_type == "translations":
if language is None:
language = "all"
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["translations"],
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
)
return

# MARK: Query Data using Wikidata Dump

elif wikidata_dump:
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
data_types=data_types,
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
)
return

# MARK: Query Data

elif language or data_type:
Expand Down
46 changes: 44 additions & 2 deletions src/scribe_data/cli/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@
# from scribe_data.cli.list import list_wrapper
from scribe_data.cli.get import get_data
from scribe_data.cli.total import total_wrapper
from scribe_data.cli.version import get_local_version
from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump
from scribe_data.utils import (
DEFAULT_JSON_EXPORT_DIR,
DEFAULT_DUMP_EXPORT_DIR,
data_type_metadata,
language_metadata,
list_all_languages,
Expand Down Expand Up @@ -262,6 +263,7 @@ def request_total_lexeme_loop():
choices=[
Choice("Configure total lexemes request", "total"),
Choice("Run total lexemes request", "run"),
Choice("Run total lexemes request with lexeme dumps", "run_all"),
Choice("Exit", "exit"),
],
).ask()
Expand All @@ -275,6 +277,18 @@ def request_total_lexeme_loop():
config.selected_languages, config.selected_data_types = [], []
rprint(THANK_YOU_MESSAGE)
break
elif choice == "run_all":
if wikidata_dump_path := prompt(
f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): "
):
wikidata_dump_path = Path(wikidata_dump_path)

parse_wd_lexeme_dump(
language=config.selected_languages,
wikidata_dump_type=["total"],
wikidata_dump_path=wikidata_dump_path,
)
break
elif choice == "exit":
return
else:
Expand Down Expand Up @@ -316,7 +330,6 @@ def start_interactive_mode(operation: str = None):
operation : str
The type of operation that interactive mode is being run with.
"""
rprint(f"[bold cyan]Welcome to {get_local_version()} interactive mode![/bold cyan]")
while True:
# Check if both selected_languages and selected_data_types are empty.
if not config.selected_languages and not config.selected_data_types:
Expand All @@ -333,6 +346,12 @@ def start_interactive_mode(operation: str = None):
# Choice("See list of languages", "languages"),
Choice("Exit", "exit"),
]
elif operation == "translations":
choices = [
Choice("Configure translations request", "translations"),
# Choice("See list of languages", "languages"),
Choice("Exit", "exit"),
]

else:
choices = [
Expand All @@ -356,6 +375,29 @@ def start_interactive_mode(operation: str = None):
request_total_lexeme_loop()
break

elif choice == "translations":
prompt_for_languages()

if wikidata_dump_path := prompt(
f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): "
):
wikidata_dump_path = Path(wikidata_dump_path)

if output_dir := prompt(
f"Enter output directory (default: {config.output_dir}): "
):
config.output_dir = Path(output_dir)

parse_wd_lexeme_dump(
language=config.selected_languages,
wikidata_dump_type=["translations"],
data_types=None,
type_output_dir=config.output_dir,
wikidata_dump_path=wikidata_dump_path,
)

break

# elif choice == "languages":
# see_list_languages()
# break
Expand Down
45 changes: 41 additions & 4 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from pathlib import Path

from rich import print as rprint
from questionary import select

from scribe_data.cli.cli_utils import validate_language_and_data_type
from scribe_data.cli.convert import convert_wrapper
Expand Down Expand Up @@ -200,7 +201,8 @@ def main() -> None:
total_parser.add_argument(
"-wdp",
"--wikidata-dump-path",
type=str,
nargs="?",
const=True,
help="Path to a local Wikidata lexemes dump for running with '--all'.",
)

Expand Down Expand Up @@ -298,12 +300,22 @@ def main() -> None:
help="Download Wikidata dump. Optionally specify date in YYYYMMDD format.",
)
download_parser.add_argument(
"-od",
"--output-dir",
"-wdp",
"--wikidata-dump-path",
type=str,
help="The output directory path for the downloaded dump.",
)

# MARK: Interactive

interactive_parser = subparsers.add_parser(
"interactive",
aliases=["i"],
help="Run in interactive mode.",
description="Run in interactive mode.",
)
interactive_parser._actions[0].help = "Show this help message and exit."

# MARK: Setup CLI

args = parser.parse_args()
Expand Down Expand Up @@ -400,9 +412,34 @@ def main() -> None:
wikidata_dump=args.wikidata_dump_version
if args.wikidata_dump_version != "latest"
else None,
output_dir=args.output_dir,
output_dir=args.wikidata_dump_path,
)

elif args.command in ["interactive", "i"]:
rprint(
f"[bold cyan]Welcome to {get_version_message()} interactive mode![/bold cyan]"
)
action = select(
"What would you like to do?",
choices=[
"Download a Wikidata dump",
"Check for totals",
"Get data",
"Get translations",
"Exit",
],
).ask()

if action == "Download a Wikidata dump":
wd_lexeme_dump_download_wrapper()
elif action == "Check for totals":
start_interactive_mode(operation="total")
elif action == "Get data":
start_interactive_mode(operation="get")
elif action == "Get translations":
start_interactive_mode(operation="translations")
else:
print("Skipping action")
else:
parser.print_help()

Expand Down
27 changes: 24 additions & 3 deletions src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
list_all_languages,
)
from scribe_data.wikidata.wikidata_utils import sparql
from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump


def get_qid_by_input(input_str):
Expand Down Expand Up @@ -370,7 +371,7 @@ def total_wrapper(
language: Union[str, List[str]] = None,
data_type: Union[str, List[str]] = None,
all_bool: bool = False,
wikidata_dump: str = None,
wikidata_dump: Union[str, bool] = None,
) -> None:
"""
Conditionally provides the full functionality of the total command.
Expand All @@ -387,9 +388,29 @@ def total_wrapper(
all_bool : boolean
Whether all languages and data types should be listed.

wikidata_dump : str
The local Wikidata dump that can be used to process data.
wikidata_dump : Union[str, bool]
The local Wikidata dump path that can be used to process data.
If True, indicates the flag was used without a path.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the care you're putting into the doc strings, @axif0! :)

"""
# Handle --all flag
if all_bool and wikidata_dump:
language = "all"

if wikidata_dump is True: # flag without a wikidata dump path
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["total"],
wikidata_dump_path=None,
)
return

if isinstance(wikidata_dump, str): # if user provided a wikidata dump path
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["total"],
wikidata_dump_path=wikidata_dump,
)
return

if (not language and not data_type) and all_bool:
print_total_lexemes()
Expand Down
1 change: 1 addition & 0 deletions src/scribe_data/resources/data_type_metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@
"prepositions": "Q4833830",
"pronouns": "Q36224",
"proper_nouns": "Q147276",
"translations": "Q21112633",
"verbs": "Q24905"
}
Loading
Loading