
Commit 47e0896

preparing code

1 parent 2ffb62c commit 47e0896

3 files changed: +313 -0 lines changed

src/prepare_data.py

+40
@@ -0,0 +1,40 @@
import torch
import skimage.io as io
from PIL import Image
import os

# modified from https://github.com/dino-chiio/blip-vqa-finetune/blob/main/finetuning.py
class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset."""

    def __init__(self, dataset, processor, img_path=""):
        self.dataset = dataset
        self.processor = processor
        self.img_path = img_path

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # get image + text
        question = self.dataset[idx]['question']
        answer = self.dataset[idx]['answer']
        image_file = self.dataset[idx]['image']
        image_path = os.path.join(self.img_path, image_file)
        image = Image.open(image_path).convert("RGB")
        text = question

        encoding = self.processor(image, text,
                                  max_length=512, padding="max_length",
                                  truncation=True, return_tensors="pt")
        labels = self.processor.tokenizer.encode(
            answer, max_length=8, padding="max_length", return_tensors='pt'
        )
        encoding["labels"] = labels

        # remove the batch dimension added by return_tensors="pt"
        for k, v in encoding.items():
            encoding[k] = v.squeeze()

        return encoding
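
For context, a minimal usage sketch of VQADataset (not part of this commit). The record layout ({'question', 'answer', 'image'}) matches what __getitem__ reads, but the sample record, the image directory, and the choice of Blip2Processor for BASE_MODEL are assumptions:

from transformers import Blip2Processor
from torch.utils.data import DataLoader

from prepare_data import VQADataset

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# hypothetical annotation record in the format __getitem__ expects
records = [{"question": "What is in this can?",
            "answer": "soda",
            "image": "VizWiz_train_00000001.jpg"}]

dataset = VQADataset(dataset=records, processor=processor,
                     img_path="/kaggle/input/vizwiz/train/train")
loader = DataLoader(dataset, batch_size=1, shuffle=False)
batch = next(iter(loader))  # dict with pixel_values, input_ids, attention_mask, labels
print({k: v.shape for k, v in batch.items()})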

src/preprocessing.py

+153
@@ -0,0 +1,153 @@
#!/usr/bin/env python
import os

from datasets import load_dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm

import numpy as np
import argparse
import random

import json
from shutil import copyfile

# make the Kaggle dataset helpers importable from the working directory
copyfile(src=os.path.join("/kaggle/input/vizwiz-dataset", 'vqa.py'), dst=os.path.join("../working", 'vqa.py'))
copyfile(src=os.path.join("/kaggle/input/vizwiz-dataset", 'prepare_data.py'), dst=os.path.join("../working", 'prepare_data.py'))

from vqa import *
from prepare_data import *

BASE_MODEL = "Salesforce/blip2-opt-2.7b"


def split_dataset(dataset, train_ratio=0.7, valid_ratio=0.1, test_ratio=0.2):
    # Shuffle the dataset in place
    random.shuffle(dataset)

    # Calculate split indices
    total_size = len(dataset)
    train_size = int(total_size * train_ratio)
    valid_size = int(total_size * valid_ratio)
    test_size = total_size - train_size - valid_size

    # Split the dataset
    train_set = dataset[:train_size]
    valid_set = dataset[train_size:train_size + valid_size]
    test_set = dataset[train_size + valid_size:]

    return train_set, valid_set, test_set


def load_dataset_vizwiz(data_path="/kaggle/input/vizwiz"):
    INPUT_PATH = data_path
    IMG_PATH = INPUT_PATH
    ANNOTATIONS = INPUT_PATH + '/Annotations/Annotations'
    TRAIN_PATH = INPUT_PATH + '/train/train'
    VALIDATION_PATH = INPUT_PATH + '/val/val'
    TEST_PATH = INPUT_PATH + '/test/test'
    ANNOTATIONS_TRAIN_PATH = ANNOTATIONS + '/train.json'
    ANNOTATIONS_VAL_PATH = ANNOTATIONS + '/val.json'
    ANNOTATIONS_TEST_PATH = ANNOTATIONS + '/test.json'

    data_VQA = {d_type: None for d_type in ['train', 'valid', 'test']}
    for d_type, img_dir, ann_file in zip(['train', 'valid', 'test'],
                                         [TRAIN_PATH, VALIDATION_PATH, TEST_PATH],
                                         [ANNOTATIONS_TRAIN_PATH, ANNOTATIONS_VAL_PATH, ANNOTATIONS_TEST_PATH]):
        # initialize VQA api for QA annotations
        vqa = VQA(ann_file)

        # load QA annotations for the given answer types;
        # ansTypes can be: yes/no, number, other, unanswerable
        anns = vqa.getAnns(ansTypes=['other', 'yes/no', 'number'])
        anns = vqa.getBestAnns(ansTypes=['other', 'yes/no', 'number'])

        data_VQA[d_type] = anns

    train_n, valid_n = len(data_VQA['train']), len(data_VQA['valid'])

    data_VQA['train'] = data_VQA['train'][:10000]
    data_VQA['valid'] = data_VQA['valid'][:1000]
    print("Training set: {}->{} - Validation set: {}->{}".format(train_n, len(data_VQA['train']), valid_n, len(data_VQA['valid'])))

    return data_VQA, TRAIN_PATH, VALIDATION_PATH
    # train_dataset = VQADataset(dataset=data_VQA['train'],
    #                            processor=processor,
    #                            img_path=TRAIN_PATH)
    # valid_dataset = VQADataset(dataset=data_VQA['valid'],
    #                            processor=processor,
    #                            img_path=VALIDATION_PATH)

    # batch_size = 1
    # train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    # valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # return train_dataloader, valid_dataloader


def load_dataset_kvqa(data_path: str = "/kaggle/input/vqa-blind-ko"):
    INPUT_PATH = data_path
    TRAIN_PATH = INPUT_PATH + '/VQA_train/images'
    ANNOTATIONS = INPUT_PATH
    TEST_PATH = INPUT_PATH + '/VQA_test/task07_images'
    ANNOTATIONS_TRAIN_PATH = ANNOTATIONS + '/train_en.json'
    ANNOTATIONS_TEST_PATH = TEST_PATH + '/test.json'

    annFile = ANNOTATIONS_TRAIN_PATH

    # initialize VQA api for QA annotations
    vqa = VQA(annFile)

    # load QA annotations (all answer types);
    # ansTypes can be: yes/no, number, other, unanswerable
    anns = vqa.getAnns()
    anns = vqa.getBestAnns()

    # Split the dataset into train, validation, and test sets
    train_set, valid_set, test_set = split_dataset(anns)
    train_n, valid_n = len(train_set), len(valid_set)
    train_set = train_set[:20000]
    valid_set = valid_set[:2000]
    data_VQA = {
        'train': train_set,
        'valid': valid_set,
        'test': test_set
    }
    print("Training set: {}->{} - Validation set: {}->{}".format(train_n, len(data_VQA['train']), valid_n, len(data_VQA['valid'])))

    return data_VQA, TRAIN_PATH, TRAIN_PATH


# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--kaggle", type=bool, required=True, default=True)
#     parser.add_argument("--vizwiz_path", type=str, required=True, default="/kaggle/input/vizwiz")
#     parser.add_argument("--kvqa_path", type=str, required=True, default="/kaggle/input/vqa-blind-ko")
#     parser.add_argument("--lib_path", type=str, required=True, default="/kaggle/input/vizwiz-dataset")
#     args = parser.parse_args()

#     # load_dataset()
#     if args.kaggle:
#         from shutil import copyfile
#         copyfile(src=os.path.join(args.lib_path, 'vqa.py'), dst=os.path.join("../working", 'vqa.py'))

#     from vqa import *
#     load_dataset()
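
The commented-out DataLoader calls above reference a collate_fn that is not included in this commit. A minimal sketch of what it could look like, assuming every encoding returned by VQADataset has the same keys and fixed-size tensors (max_length padding), so examples can simply be stacked:

import torch

def collate_fn(batch):
    # batch is a list of dicts produced by VQADataset.__getitem__;
    # stack each field along a new batch dimension
    return {key: torch.stack([example[key] for example in batch])
            for key in batch[0].keys()}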

src/vqa.py

+120
@@ -0,0 +1,120 @@
__author__ = 'QingLi'
__version__ = '1.0'
## modified by sooh-J
# Interface for accessing the VQA dataset.

# This code is based on the code written by Qing Li for the VizWiz Python API, available at:
# (https://github.com/xxx)

# The following functions are defined:
#  VQA         - VQA class that loads the VQA annotation file and prepares data structures.
#  getImgs     - Get the image names present in the annotation file.
#  getAnns     - Get annotations that satisfy given filter conditions.
#  getBestAnns - Keep only confident answers and select a single best answer per question.
#  showQA      - Display the specified questions and answers.

# Help on each function can be accessed by: "help(VQA.function)"

import json
import datetime
import copy
import random
import skimage.io as io
import matplotlib.pyplot as plt
import os
from collections import Counter


def export_max_value(ans):
    # return the most frequent answer; ties are broken by the longest string
    ans_cnt = Counter(ans)

    max_cnt = max(ans_cnt.values())
    max_ans = max([k for k, v in ans_cnt.items() if v == max_cnt], key=len)

    return max_ans


class VQA:
    def __init__(self, annotation_file=None):
        """
        Constructor of VQA helper class for reading and visualizing questions and answers.
        :param annotation_file (str): location of VQA annotation file
        :return:
        """
        # load dataset
        self.dataset = {}
        self.imgToQA = {}
        if annotation_file is not None:
            print('loading dataset into memory...')
            time_t = datetime.datetime.utcnow()
            dataset = json.load(open(annotation_file, 'r'))
            print(datetime.datetime.utcnow() - time_t)
            self.dataset = dataset
            self.imgToQA = {x['image']: x for x in dataset}
        self.anns = {}

    def getImgs(self):
        return list(self.imgToQA.keys())

    def getAnns(self, imgs=[], ansTypes=[]):
        """
        Get annotations that satisfy given filter conditions. An empty filter is skipped.
        :param imgs (str array): get annotations for given image names
               ansTypes (str array): get annotations for given answer types
        :return: annotations (dict array): dict array of annotations
        """
        anns = self.dataset

        imgs = imgs if type(imgs) == list else [imgs]
        if len(imgs) != 0:
            anns = [self.imgToQA[img] for img in imgs]

        ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
        if len(ansTypes) != 0:
            anns = [ann for ann in anns if ann.get('answer_type', "None") in ansTypes]

        self.anns = anns
        return anns

    def getBestAnns(self, imgs=[], ansTypes=[]):
        """
        code by SOOH-J
        Filter to the best answer (a single answer given with confidence).
        :param imgs (str array): get annotations for given image names
               ansTypes (str array): get annotations for given answer types
        :return: annotations (dict array): dict array of annotations
        """
        try:
            anns = self.anns
        except AttributeError:
            anns = self.getAnns(imgs, ansTypes)

        # include only answers given with confidence
        confidence_anns = []
        for ann in anns:
            confidence_ann = [an for an in ann['answers'] if an.get('answer_confidence') == 'yes']
            if confidence_ann:
                ann_copy = ann.copy()
                ann_copy['answers'] = confidence_ann
                try:
                    confidence_answers = [con_ans['answer'] for con_ans in confidence_ann
                                          if con_ans['answer'] not in ['unanswerable', 'unsuitable image', 'unsuitable']]
                    ann_copy['answer'] = export_max_value(confidence_answers)
                except ValueError:
                    # every confident answer was unanswerable/unsuitable; drop this annotation
                    continue
                confidence_anns.append(ann_copy)
        return confidence_anns

    def showQA(self, anns):
        """
        Display the specified annotations.
        :param anns (array of object): annotations to display
        :return: None
        """
        if len(anns) == 0:
            return 0
        for ann in anns:
            print("Question: %s" % ann['question'])
            print("Answer: ")
            print('\n'.join([f"\tanswer:{x['answer']}, confidence:{x['answer_confidence']}" for x in ann['answers']]))
            print("SOOOO the best answer:")
            print(f"\t{ann['answer']}")
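
To illustrate how getAnns, getBestAnns, and export_max_value reduce the crowdsourced answers to a single label, here is a small hypothetical example (not part of this commit); the sample record and field names follow the VizWiz annotation format that vqa.py assumes:

from vqa import VQA

vqa = VQA()
vqa.dataset = [{
    "image": "VizWiz_train_00000001.jpg",
    "question": "What is this?",
    "answer_type": "other",
    "answers": [
        {"answer": "soda can", "answer_confidence": "yes"},
        {"answer": "can", "answer_confidence": "yes"},
        {"answer": "soda can", "answer_confidence": "maybe"},
        {"answer": "unanswerable", "answer_confidence": "yes"},
    ],
}]

anns = vqa.getAnns(ansTypes=["other"])      # populates vqa.anns, as preprocessing.py does
best = vqa.getBestAnns(ansTypes=["other"])
print(best[0]["answer"])  # "soda can": only confident, answerable answers count;
                          # ties on frequency are broken by the longer string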
