
Commit 17ef4b7

add dataset
1 parent 16bcdd3 commit 17ef4b7

File tree

3 files changed, +134 -0 lines changed


data/VQA_BLIND_KO/VQA_BLIND_KO.md

Whitespace-only changes.

data/VQA_BLIND_KO/split.ipynb

+134
@@ -0,0 +1,134 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Split the train dataset file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "file_path = 'train.xlsx'\n",
    "df = pd.read_excel(file_path)\n",
    "split_n = 10\n",
    "\n",
    "# Base number of rows per split file, plus the remainder to spread one row at a time\n",
    "rows_per_file = len(df) // split_n\n",
    "remaining_rows = len(df) % split_n\n",
    "\n",
    "start_row = 0\n",
    "\n",
    "# Write split_n files, keeping only columns 0, 2, 4, and 5\n",
    "for i in range(split_n):\n",
    "    end_row = start_row + rows_per_file\n",
    "    if remaining_rows > 0:\n",
    "        end_row += 1\n",
    "        remaining_rows -= 1\n",
    "    split_df = df.iloc[start_row:end_row, [0, 2, 4, 5]]\n",
    "    split_df.to_excel(f'train_{i+1}.xlsx', index=False)\n",
    "    start_row = end_row"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Translate the split files from Korean to English using [Google Translate](https://translate.google.co.kr/?sl=auto&tl=ko&op=translate)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Merge the files and save in JSON format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Columns in final_df: Index(['image_id', ' category', ' question', ' answer', 'image'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Read the original Excel file\n",
    "file_path = 'train.xlsx'\n",
    "train_df = pd.read_excel(file_path)\n",
    "split_n = 10\n",
    "\n",
    "# List of translated files to merge\n",
    "file_list = [f'train_en_{i+1}.xlsx' for i in range(split_n)]\n",
    "\n",
    "# Create an empty DataFrame\n",
    "merged_df = pd.DataFrame()\n",
    "\n",
    "# Read each file and concatenate\n",
    "for file in file_list:\n",
    "    df_en = pd.read_excel(file)\n",
    "    merged_df = pd.concat([merged_df, df_en], ignore_index=True)\n",
    "\n",
    "train_columns = ['image_id', 'image']\n",
    "\n",
    "train_df = train_df[train_columns]\n",
    "final_df = pd.merge(merged_df, train_df, on='image_id', how='left')\n",
    "final_df = final_df.drop_duplicates()\n",
    "print(\"Columns in final_df:\", final_df.columns)\n",
    "\n",
    "json_list = []\n",
    "for _, row in final_df.iterrows():\n",
    "    json_entry = {\n",
    "        \"image\": row['image'],\n",
    "        \"question\": row[' question'].strip(),\n",
    "        \"answers\": [\n",
    "            {\n",
    "                \"answer\": str(row[' answer']).strip(),\n",
    "                \"answer_confidence\": \"yes\"\n",
    "            },\n",
    "        ],\n",
    "        \"answer_type\": row[' category'].strip(),  # may change depending on the example data\n",
    "        \"answerable\": 1  # may change depending on the example data\n",
    "    }\n",
    "    json_list.append(json_entry)\n",
    "\n",
    "# Save in JSON format\n",
    "with open('train_en.json', 'w') as json_file:\n",
    "    json.dump(json_list, json_file, indent=2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "boda2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
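
As a quick reference for consumers of this dataset (not part of the committed notebook), here is a minimal sketch of how the generated train_en.json could be loaded and sanity-checked, assuming only the record structure built in the merge cell above:

```python
import json

# Load the merged, translated annotations produced by the notebook above.
with open('train_en.json', 'r', encoding='utf-8') as f:
    records = json.load(f)

# Each record is expected to carry the fields built in the merge cell.
expected_keys = {"image", "question", "answers", "answer_type", "answerable"}
for record in records:
    missing = expected_keys - record.keys()
    assert not missing, f"record for {record.get('image')} is missing {missing}"

print(f"{len(records)} records loaded; first question: {records[0]['question']}")
```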

data/VizWiz-VQA/VizWiz-VQA.md

Whitespace-only changes.
