
Commit 025cb00

committed
feat: accelerate for big model
1 parent 7868ea7 commit 025cb00

File tree

7 files changed: +233, -6 lines


opengpt/factory.py (+11)

@@ -2,6 +2,7 @@
 from typing import Optional, Union

 import torch
+from loguru import logger


 def list_models():
@@ -35,6 +36,8 @@ def create_model_and_transforms(
     # TODO: Add support for loading config based on model name
     model_config = {}

+    logger.debug(f'Loading model: {model_name}')
+
     if model_name == 'OpenFlamingo-9B':
         from .models.flamingo.loading import load_model_and_transforms

@@ -44,5 +47,13 @@ def create_model_and_transforms(
             'tokenizer_name_or_path': 'llama_7B',
         }
         return load_model_and_transforms(**model_config)
+    elif model_name.startswith('facebook/llama'):
+        from .models.llama.loading import load_model_and_tokenizer
+
+        model_config = {
+            'model_name_or_path': 'llama_7B',
+            'tokenizer_name_or_path': 'llama_7B',
+        }
+        return load_model_and_tokenizer(**model_config)
     else:
         raise ValueError(f'Unknown model name: {model_name}')
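
With this change, any model name starting with 'facebook/llama' is routed to the new accelerate-based loader instead of the OpenFlamingo path. A minimal usage sketch, assuming the factory is called with the model_name keyword shown in the diff (the full signature and any other parameters are not visible here and are omitted):

    from opengpt.factory import create_model_and_transforms

    # Illustrative model identifier; for the 'facebook/llama' branch the call
    # returns a (model, tokenizer) pair from opengpt/models/llama/loading.py.
    model, tokenizer = create_model_and_transforms(model_name='facebook/llama-7B')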

opengpt/helper.py (+22)

@@ -1,3 +1,25 @@
+import sys
+
+from loguru import logger
+
+
+def setup_logging(debug: bool):
+    """
+    Setup the log formatter for AnnLite.
+    """
+
+    log_level = 'INFO'
+    if debug:
+        log_level = 'DEBUG'
+
+    logger.remove()
+    logger.add(
+        sys.stdout,
+        colorize=True,
+        level=log_level,
+    )
+
+
 def get_envs():
     from torch.utils import collect_env
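
setup_logging() removes loguru's default sink and re-adds stdout at INFO or DEBUG level. A sketch of how it would typically be wired into a click-based entry point (the --debug flag and main() command below are illustrative, not part of this commit; click is already a project dependency):

    import click

    from opengpt.helper import setup_logging

    @click.command()
    @click.option('--debug', is_flag=True, default=False, help='Enable DEBUG-level logging.')
    def main(debug: bool):
        # Reconfigure loguru before any model loading happens.
        setup_logging(debug)

    if __name__ == '__main__':
        main()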

opengpt/models/flamingo/loading.py (+1, -1)

@@ -2,7 +2,7 @@
 from open_flamingo.src.flamingo_lm import FlamingoLMMixin
 from open_flamingo.src.utils import extend_instance

-from ..hf_model import load_model_and_tokenizer
+from ..llama.loading import load_model_and_tokenizer
 from .modeling import FlamingoModel

opengpt/models/llama/loading.py (+30, -5)

@@ -1,12 +1,37 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from typing import TYPE_CHECKING, Union

+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

-def hf_load_model_and_tokenizer(model_name_or_path: str, tokenizer_name_or_path: str):
+if TYPE_CHECKING:
+    import torch
+
+from loguru import logger
+
+
+def load_model_and_tokenizer(
+    model_name_or_path: str,
+    tokenizer_name_or_path: str,
+    dtype: Union[str, 'torch.dtype'] = 'torch.float16',
+    **kwargs
+):
     """Load a model and tokenizer from HuggingFace."""
     tokenizer = AutoTokenizer.from_pretrained(
-        tokenizer_name_or_path, local_files_only=False
+        tokenizer_name_or_path, local_files_only=True
     )
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name_or_path, local_files_only=False
+
+    # Create a model and initialize it with empty weights
+    config = AutoConfig.from_pretrained(model_name_or_path, local_files_only=True)
+
+    with init_empty_weights():
+        model = AutoModelForCausalLM.from_config(config)
+
+    # Load the checkpoint and dispatch it to the right devices
+    model = load_checkpoint_and_dispatch(
+        model, model_name_or_path, device_map="auto", dtype=dtype, **kwargs
     )
+
+    # model = AutoModelForCausalLM.from_pretrained(
+    #     model_name_or_path, local_files_only=False
+    # )
     return model, tokenizer
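
This is the Accelerate "big model" loading pattern: init_empty_weights() builds the model skeleton on the meta device without allocating any weight memory, and load_checkpoint_and_dispatch() then streams the checkpoint in and places each submodule on the available GPUs (spilling to CPU if needed) according to device_map="auto". A usage sketch, assuming 'llama_7B' is a local directory containing both the tokenizer files and the sharded checkpoint (local_files_only=True means nothing is downloaded):

    from opengpt.models.llama.loading import load_model_and_tokenizer

    # The directory name is illustrative; it must already exist on disk.
    model, tokenizer = load_model_and_tokenizer(
        model_name_or_path='llama_7B',
        tokenizer_name_or_path='llama_7B',
        dtype='torch.float16',
    )

    # With device_map='auto' the layers are spread across devices; Accelerate's
    # dispatch hooks move inputs to the right device during the forward pass.
    inputs = tokenizer('Hello, my name is', return_tensors='pt')
    outputs = model.generate(**inputs, max_new_tokens=20)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))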

opengpt/profile.py (+101, new file)

@@ -0,0 +1,101 @@
"""This file contains a few functions to profile the memory usage of the model.

It is not meant to be used in production, but rather to help us debug the memory usage of the model.

The codes are borrowed from https://github.com/huggingface/accelerate/blob/main/benchmarks/measures_util.py
"""

import gc
import threading
import time

import psutil
import torch
from accelerate.utils import compute_module_sizes


class PeakCPUMemory:
    def __init__(self):
        self.process = psutil.Process()
        self.peak_monitoring = False

    def peak_monitor(self):
        self.cpu_memory_peak = -1

        while True:
            self.cpu_memory_peak = max(
                self.process.memory_info().rss, self.cpu_memory_peak
            )

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            if not self.peak_monitoring:
                break

    def start(self):
        self.peak_monitoring = True
        self.thread = threading.Thread(target=self.peak_monitor)
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        self.peak_monitoring = False
        self.thread.join()
        return self.cpu_memory_peak


cpu_peak_tracker = PeakCPUMemory()


def start_measure():
    # Time
    measures = {"time": time.time()}

    gc.collect()
    torch.cuda.empty_cache()

    # CPU mem
    measures["cpu"] = psutil.Process().memory_info().rss
    cpu_peak_tracker.start()

    # GPU mem
    for i in range(torch.cuda.device_count()):
        measures[str(i)] = torch.cuda.memory_allocated(i)
    torch.cuda.reset_peak_memory_stats()

    return measures


def end_measure(start_measures):
    # Time
    measures = {"time": time.time() - start_measures["time"]}

    gc.collect()
    torch.cuda.empty_cache()

    # CPU mem
    measures["cpu"] = (
        psutil.Process().memory_info().rss - start_measures["cpu"]
    ) / 2**20
    measures["cpu-peak"] = (cpu_peak_tracker.stop() - start_measures["cpu"]) / 2**20

    # GPU mem
    for i in range(torch.cuda.device_count()):
        measures[str(i)] = (
            torch.cuda.memory_allocated(i) - start_measures[str(i)]
        ) / 2**20
        measures[f"{i}-peak"] = (
            torch.cuda.max_memory_allocated(i) - start_measures[str(i)]
        ) / 2**20

    return measures


def log_measures(measures, description):
    print(f"{description}:")
    print(f"- Time: {measures['time']:.2f}s")
    for i in range(torch.cuda.device_count()):
        print(f"- GPU {i} allocated: {measures[str(i)]:.2f}MiB")
        peak = measures[f"{i}-peak"]
        print(f"- GPU {i} peak: {peak:.2f}MiB")
    print(f"- CPU RAM allocated: {measures['cpu']:.2f}MiB")
    print(f"- CPU RAM peak: {measures['cpu-peak']:.2f}MiB")

pyproject.toml (+1)

@@ -39,6 +39,7 @@ build-backend = "poetry.core.masonry.api"
 # Compatible Python versions
 python = ">=3.8"
 torch = ">=1.9.0,<2.0.0" # a meta device requires torch >= 1.9.0
+loguru = "^0.5"
 click = "^8.1.3"
 numpy = "^1.21.2"
 einops = "^0.6.0"

scripts/upload_to_s3.py (+67, new file)

@@ -0,0 +1,67 @@
# pip install boto3 hf_transfer

import os
from math import ceil
from time import time

import boto3
from hf_transfer import multipart_upload

# 10 MiB
CHUNK_SIZE = 10_485_760

s3 = boto3.client("s3")

bucket = "test-hf-transfer-multi-part-upload"
bucket_key = "some_file"

upload = s3.create_multipart_upload(
    ACL="bucket-owner-full-control",
    Bucket=bucket,
    Key=bucket_key,
)
upload_id = upload["UploadId"]
print("created multipart upload")

file_name = "some_file"
file_size = os.stat(file_name).st_size

urls = []
nb_parts = ceil(file_size / CHUNK_SIZE)
for part_number in range(1, nb_parts + 1):
    params = {
        "Bucket": bucket,
        "Key": bucket_key,
        "PartNumber": part_number,
        "UploadId": upload_id,
    }
    urls.append(
        s3.generate_presigned_url(
            ClientMethod="upload_part", Params=params, ExpiresIn=86400
        )
    )
print("prepared parts urls")

print("uploading parts...")
start = time()
responses = multipart_upload(
    file_path=file_name,
    parts_urls=urls,
    chunk_size=CHUNK_SIZE,
    max_files=64,
    parallel_failures=63,
    max_retries=5,
)
print(f"uploaded parts in {time() - start}")

etag_with_parts = []
for part_number, header in enumerate(responses):
    etag = header.get("etag")
    etag_with_parts.append({"ETag": etag, "PartNumber": part_number + 1})

parts = {"Parts": etag_with_parts}

s3.complete_multipart_upload(
    Bucket=bucket, Key=bucket_key, MultipartUpload=parts, UploadId=upload_id
)
print("upload complete")

0 commit comments
