georgian-io · benjaminye · Apr 2, 2024 · Apr 2, 2024 · Apr 2, 2024 · Apr 2, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.3.5
+    hooks:
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format
diff --git a/README.md b/README.md
@@ -44,7 +44,7 @@ LLM Finetuning toolkit is a config-based CLI tool for launching a series of LLM
 See poetry documentation page for poetry [installation instructions](https://python-poetry.org/docs/#installation)
 
 ```shell
-   poetry install
+   poetry install --without dev
 ```
 
 ### [Option 3] pip
@@ -255,3 +255,10 @@ If you would like to contribute to this project, we recommend following the "for
 5.  Submit a **Pull request** so that we can review your changes
 
 NOTE: Be sure to merge the latest from "upstream" before making a pull request!
+
+### Setting Up Repo for Development
+
+- We recommend using `poetry` to manage dependencies
+- Install dependencies via `poetry install`
+- Enter virtual environment with `poetry shell`
+- Install pre-commit hooks using `pre-commit install`
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -43,8 +43,31 @@ shellingham = "^1.5.4"
 
 
 [tool.poetry.group.dev.dependencies]
-black = "^24.3.0"
+pre-commit = "~3.7.0"
+ruff = "~0.3.5"
 
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+
+[tool.ruff]
+lint.ignore = ["C901", "E501", "E741", "F402", "F823" ]
+lint.select = ["C", "E", "F", "I", "W"]
+line-length = 119
+exclude = [
+    "llama2",
+    "mistral",
+]
+
+
+[tool.ruff.lint.isort]
+lines-after-imports = 2
+known-first-party = ["llmtune"]
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
+
diff --git a/src/data/dataset_generator.py b/src/data/dataset_generator.py
@@ -1,14 +1,11 @@
 import os
-from os.path import join, exists
+import pickle
+import re
 from functools import partial
+from os.path import exists, join
 from typing import Tuple, Union
-import pickle
 
-import re
 from datasets import Dataset
-from rich.console import Console
-from rich.layout import Layout
-from rich.panel import Panel
 
 from src.data.ingestor import Ingestor, get_ingestor
 
@@ -64,12 +61,8 @@ def _format_one_prompt(self, example, is_test: bool = False):
         return example
 
     def _format_prompts(self):
-        self.dataset["train"] = self.dataset["train"].map(
-            partial(self._format_one_prompt, is_test=False)
-        )
-        self.dataset["test"] = self.dataset["test"].map(
-            partial(self._format_one_prompt, is_test=True)
-        )
+        self.dataset["train"] = self.dataset["train"].map(partial(self._format_one_prompt, is_test=False))
+        self.dataset["test"] = self.dataset["test"].map(partial(self._format_one_prompt, is_test=True))
 
     def get_dataset(self) -> Tuple[Dataset, Dataset]:
         self._train_test_split()

diff --git a/src/data/ingestor.py b/src/data/ingestor.py
@@ -1,9 +1,8 @@
+import csv
 from abc import ABC, abstractmethod
-from functools import partial
 
 import ijson
-import csv
-from datasets import Dataset, load_dataset, concatenate_datasets
+from datasets import Dataset, concatenate_datasets, load_dataset
 
 
 def get_ingestor(data_type: str):
@@ -14,9 +13,7 @@ def get_ingestor(data_type: str):
     elif data_type == "huggingface":
         return HuggingfaceIngestor
     else:
-        raise ValueError(
-            f"'type' must be one of 'json', 'csv', or 'huggingface', you have {data_type}"
-        )
+        raise ValueError(f"'type' must be one of 'json', 'csv', or 'huggingface', you have {data_type}")
 
 
 class Ingestor(ABC):

diff --git a/src/finetune/finetune.py b/src/finetune/finetune.py
@@ -1,5 +1,4 @@
 from abc import ABC, abstractmethod
-from typing import Union, List, Tuple, Dict
 
 
 class Finetune(ABC):