GSM8K Guidance Benchmark #50

Open
wants to merge 37 commits into main

37 commits
1d8ec5b
Starting to figure out data
riedgar-ms Apr 9, 2024
cc7fcc1
Make sure we can process data
riedgar-ms Apr 9, 2024
b6f44fb
Add component definition YAML
riedgar-ms Apr 9, 2024
5605b2f
Trying to get to first submission
riedgar-ms Apr 9, 2024
18bbb22
Give up on locale :-/
riedgar-ms Apr 9, 2024
5be4bed
Merge remote-tracking branch 'origin/main' into riedgar-ms/gsm8k-benc…
riedgar-ms Apr 9, 2024
6fa1f93
Drafting a component
riedgar-ms Apr 9, 2024
98405aa
Roughing out the mistral7b component
riedgar-ms Apr 9, 2024
1e932cd
Roughing out more changes
riedgar-ms Apr 9, 2024
2935835
Better name
riedgar-ms Apr 9, 2024
d16df86
Hack things into working
riedgar-ms Apr 10, 2024
d3b2a4e
Rename things to match switch to LlamaCpp
riedgar-ms Apr 10, 2024
033c407
Add a scoring component
riedgar-ms Apr 10, 2024
cb56759
Start expanding on the prompts
riedgar-ms Apr 10, 2024
ff305b2
Unsuccessful tweak
riedgar-ms Apr 10, 2024
b8d25bf
Add some random examples
riedgar-ms Apr 10, 2024
ac49c39
Thinking about few shots a bit
riedgar-ms Apr 10, 2024
6f8ef9d
Doing some renaming
riedgar-ms Apr 10, 2024
2076b53
Tweaking
riedgar-ms Apr 10, 2024
d60345c
Some token limits
riedgar-ms Apr 10, 2024
1cfefad
More involved JSON
riedgar-ms Apr 11, 2024
0714d88
Make sure pydantic and jsonschema are available
riedgar-ms Apr 13, 2024
6f54a51
Fix regex for a number
riedgar-ms Apr 15, 2024
1bbc53d
Increase context window for model
riedgar-ms Apr 15, 2024
88ef9ef
Create a JSONL sampling component
riedgar-ms Apr 17, 2024
85657f3
Hook new component into pipeline
riedgar-ms Apr 17, 2024
53d4fe5
Try tweaking the prompts (and saving)
riedgar-ms Apr 17, 2024
4ec2b82
Fix division by zero
riedgar-ms Apr 17, 2024
bf85df8
Want mlflow available
riedgar-ms Apr 17, 2024
ade5579
Try some more logging
riedgar-ms Apr 17, 2024
542b7c8
Tweaking guidance programs
riedgar-ms Apr 17, 2024
8a2afa5
Try some extra new lines
riedgar-ms Apr 18, 2024
e51a025
Small tweaks and linting
riedgar-ms Apr 18, 2024
622be28
Bad stop for reasons
riedgar-ms Apr 18, 2024
a90380e
Dumb mistakes in json formatted response
riedgar-ms Apr 18, 2024
db91f01
Another silly mistake
riedgar-ms Apr 19, 2024
94c465c
Need Rust
riedgar-ms Apr 21, 2024
32 changes: 32 additions & 0 deletions azureml/components/jsonl_gsm8k_fetch_component.yaml
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_gsm8k_fetch
version: 0.0.1pre1
display_name: JSONL GSM8K Fetcher
type: command
description: Fetches the GSM8K dataset and formats it as JSONL
is_deterministic: true

inputs:
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output datasets

outputs:
  output_dataset:
    type: uri_folder
    description: |
      Folder which will contain 'train.jsonl' and 'test.jsonl'

code: ./src/

command: >-
  python ./jsonl_gsm8k_fetch.py
  --output_encoding ${{ inputs.output_encoding }}
  --output_dataset ${{ outputs.output_dataset }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest
90 changes: 90 additions & 0 deletions azureml/components/jsonl_guidance_mistral7b_component.yaml
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_guidance_mistral7b
version: 0.0.1pre1
display_name: JSONL Guidance Mistral7B
type: command
description: Runs a supplied Guidance program on every line of a JSONL file via Mistral7B
is_deterministic: false

inputs:
  guidance_program:
    type: uri_file
    optional: false
    description: Python file containing the guidance program
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  common_dataset:
    type: uri_file
    optional: true
    description: Dataset containing data to be shared with all rows in input
  common_encoding:
    type: string
    optional: true
    default: utf-8-sig
    description: Encoding format of the common dataset
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset
  error_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the error dataset

outputs:
  output_dataset:
    type: uri_file
    description: JSONL file
  error_dataset:
    type: uri_file
    description: JSONL file containing failed lines

code: ./src/

command: |
  # Install Rust toolchain
  #apt update
  #apt upgrade -y
  #apt install -y rustc build-essential
  #pip install setup-rust
  # Download the zip
  wget https://github.com/guidance-ai/guidance/archive/refs/heads/main.zip
  echo
  ls
  echo
  # Unzip
  unzip ./main.zip
  echo
  ls -p
  echo
  # Install from download
  pip install --upgrade ./guidance-main/
  echo
  # Install LlamaCpp
  CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python<0.2.58"
  echo
  # Run the script
  python ./jsonl_guidance_mistral7b.py \
    --guidance_program ${{ inputs.guidance_program }} \
    --input_dataset ${{ inputs.input_dataset }} \
    --input_encoding ${{ inputs.input_encoding }} \
    $[[--common_dataset ${{ inputs.common_dataset }} ]] \
    $[[--common_encoding ${{ inputs.common_encoding }} ]] \
    --output_dataset ${{ outputs.output_dataset }} \
    --output_encoding ${{ inputs.output_encoding }} \
    --error_dataset ${{ outputs.error_dataset }} \
    --error_encoding ${{ inputs.error_encoding }}

environment:
  # Will be updated when component uploads
  image: azureml:guidance_phi2_env@latest
52 changes: 52 additions & 0 deletions azureml/components/jsonl_sample_lines_component.yaml
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_sample_lines
display_name: 'JSONL Sample Lines'
type: command
description: |
  Samples lines (without replacement) from a JSONL file
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  n_samples:
    type: integer
    optional: false
    description: Number of samples required
  random_seed:
    type: integer
    optional: false
    description: Seed for Python's PRNG
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset

outputs:
  output_dataset:
    type: uri_file
    description: Dataset containing sampled JSONL

code: ./src

command: >-
  python ./jsonl_sample_lines.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --n_samples ${{ inputs.n_samples }}
  --random_seed ${{ inputs.random_seed }}
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest
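The sampling script itself (`jsonl_sample_lines.py`) is not part of this diff. Given the component's contract — `n_samples` lines drawn without replacement, reproducible via `random_seed` — a minimal sketch might look like this (the helper name `sample_jsonl_lines` is hypothetical):

```python
import json
import random


def sample_jsonl_lines(lines, n_samples, random_seed):
    """Draw n_samples lines without replacement, reproducibly."""
    rng = random.Random(random_seed)
    return rng.sample(lines, n_samples)


lines = [json.dumps({"id": i}) for i in range(100)]
picked = sample_jsonl_lines(lines, n_samples=5, random_seed=42)
```

Seeding a fresh `random.Random` per call means re-running the component with the same seed selects the same subset, consistent with `is_deterministic: true`.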
56 changes: 56 additions & 0 deletions azureml/components/jsonl_score_numeric_component.yaml
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_score_numeric
version: 0.0.1pre1
display_name: JSONL Numeric Scorer
type: command
description: |
  Takes a JSONL file containing numeric questions, correct answers,
  and model responses, and produces an overall score.
  Results are stored as JSON
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  correct_key:
    type: string
    optional: false
    description: Which key contains the correct answer
  response_key:
    type: string
    optional: false
    description: Which key contains the answer produced by the model
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset

outputs:
  output_dataset:
    type: uri_file
    description: JSON file containing score summary

code: ./src/

command: >-
  python ./jsonl_score_numeric.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --correct_key ${{ inputs.correct_key }}
  --response_key ${{ inputs.response_key }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest
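The scoring script (`jsonl_score_numeric.py`) is likewise not shown in this diff. A sketch of the kind of summary it presumably produces — with the empty-input guard suggested by the "Fix division by zero" commit — could be (the name `score_numeric` is hypothetical):

```python
import json


def score_numeric(jsonl_lines, correct_key, response_key):
    """Compare model responses against correct answers line by line."""
    n_correct = 0
    n_total = 0
    for line in jsonl_lines:
        item = json.loads(line)
        n_total += 1
        if item[response_key] == item[correct_key]:
            n_correct += 1
    # Guard against an empty dataset to avoid division by zero
    accuracy = n_correct / n_total if n_total > 0 else 0.0
    return {"n_correct": n_correct, "n_total": n_total, "accuracy": accuracy}


rows = [
    json.dumps({"answer": 72.0, "model_answer": 72.0}),
    json.dumps({"answer": 10.0, "model_answer": 9.0}),
]
summary = score_numeric(rows, "answer", "model_answer")
```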
96 changes: 96 additions & 0 deletions azureml/components/src/jsonl_gsm8k_fetch.py
import argparse
import json
import pathlib
import re

from typing import Any, Dict

import requests

from aether_utils.jsonl_file_utils import JSONLWriter
from aether_utils.logging_utils import get_standard_logger_for_file

_logger = get_standard_logger_for_file(__file__)

BASE_DATA_URL = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/"

SPLITS = ["train", "test"]


def parse_args():
    parser = argparse.ArgumentParser(add_help=True)

    # Information about the ports
    ports_group = parser.add_argument_group("Ports")
    ports_group.add_argument("--output_dataset", type=pathlib.Path, required=True)
    ports_group.add_argument("--output_encoding", type=str, required=True)

    args = parser.parse_args()
    return args


def extract_thought_parts(thought: str) -> Dict[str, Any]:
    thought_re = r"(.*)<<(.*=.*)>>(.*)"
    match = re.match(thought_re, thought)

    result = dict()
    if match:
        result["step"] = match.group(1)
        result["calculation"] = match.group(2)
        result["result"] = match.group(3)
    else:
        result["step"] = thought
    return result
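GSM8K wraps the arithmetic of each reasoning step in `<<...>>` calculator annotations, which is what this regex picks apart. A quick standalone check of the expected groups:

```python
import re


def extract_thought_parts(thought):
    # Same regex as in the component: step text, <<calculation>>, trailing result
    match = re.match(r"(.*)<<(.*=.*)>>(.*)", thought)
    result = {"step": thought}
    if match:
        result["step"] = match.group(1)
        result["calculation"] = match.group(2)
        result["result"] = match.group(3)
    return result


parts = extract_thought_parts("Natalia sold 48/2 = <<48/2=24>>24 clips in May.")
```

Steps without an annotation fall through to the `else` branch and keep the whole line as the step.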


def process_line(item: Dict[str, Any]) -> Dict[str, Any]:
    result = dict()
    _logger.debug(f"Processing {item}")

    result["question"] = item["question"]

    # The answer embeds a chain of thought and the
    # numeric result
    split_answer = item["answer"].split("####")

    result["thoughts"] = []
    for thought in split_answer[0].splitlines():
        result["thoughts"].append(extract_thought_parts(thought))

    # Stripping commas is not the proper way to handle
    # numbers with thousand separators.
    # This is a workaround, pending three-way negotiations
    # with locale.atof() and the AzureML compute nodes
    result["answer"] = float(split_answer[1].replace(",", ""))

    return result
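The `"####"` split relies on the GSM8K answer format, where the chain of thought is followed by `#### <number>`; the comma stripping covers answers with thousand separators. An illustrative input (the strings are made up for demonstration):

```python
raw_answer = "Natalia sold 48+24 = <<48+24=72>>72 clips altogether.\n#### 72"
split_answer = raw_answer.split("####")
thoughts = split_answer[0].splitlines()           # the reasoning steps
answer = float(split_answer[1].replace(",", ""))  # " 72" -> 72.0

# Thousand separators would otherwise break float()
big_answer = float("#### 1,234".split("####")[1].replace(",", ""))
```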


def main():
    args = parse_args()

    for split in SPLITS:
        _logger.info(f"Starting split {split}")
        line_count = 0
        target_url = f"{BASE_DATA_URL}{split}.jsonl"

        _logger.info(f"Fetching {target_url}")
        response = requests.get(target_url)
        response.raise_for_status()

        with JSONLWriter(
            args.output_dataset / f"{split}.jsonl", args.output_encoding
        ) as jlw:
            for line in response.text.splitlines():
                nxt_item = json.loads(line)
                output_item = process_line(nxt_item)
                jlw.write_line(output_item)
                line_count += 1
        _logger.info(f"Completed split {split} ({line_count} lines)")

    _logger.info("Complete")


if __name__ == "__main__":
    main()