Skip to content

Commit cba156a

Browse files
committed
Add Data directory
1 parent 31b818f commit cba156a

20 files changed

+3272
-0
lines changed

Data/Trends_book_system.py

+164
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
from qdrant_searching import QdrantSearcher
2+
from cuda_model import CudaModel
3+
from youtube.youtube_collector import YouTubeColletor
4+
from mysql_insert import Mysql_Manager
5+
6+
import pandas as pd
7+
from itertools import combinations
8+
import time
9+
from tqdm import tqdm
10+
11+
12+
# Pandas console display settings: show at most 10 rows and 10 columns, and
# pass None for the total width and per-column width options.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Maximum number of distinct books kept per search.
num_books_to_collect = 30
# The first `num_main_keywords` keywords of a sentence are flagged as main.
num_main_keywords = 4
# Batch timestamps to process, formatted 'YYYY-MM-DD HH:MM:SS'.
# Previously used values: '2024-05-14 02:00:00', '2024-05-13 02:00:00',
# '2024-05-12 02:00:00', '2024-05-11 02:00:00', '2024-05-10 02:00:00',
# '2024-05-09 02:00:00'.
namnams = ['2024-05-16 02:00:00']
21+
22+
def make_clu_sentence(sent, k=4):
    """Expand a sentence into every k-word combination to use as a query.

    Args:
        sent: Whitespace-separated keyword sentence.
        k: Number of words in each generated combination.

    Returns:
        A list of strings. If the sentence has ``k`` words or fewer, the list
        contains only the original sentence. Otherwise it contains one
        space-joined string per k-word combination, in the order produced by
        ``itertools.combinations`` (word order inside a combination follows
        the sentence).
    """
    words = sent.split()

    # Not enough words to combine: still return a list, so callers that
    # iterate over the result get one query rather than single characters.
    if len(words) <= k:
        return [sent]

    return [' '.join(combo) for combo in combinations(words, k)]
32+
33+
34+
if __name__ == "__main__":
    # Batch job: for every timestamp and YouTube category, collect trending
    # keywords/videos, store them through the SQL manager, search Qdrant for
    # matching books and store the recommendations.
    process_start_time = time.time()

    # Build each dependency once and report how long it took to load.
    now_time = time.time()
    cuda_model = CudaModel('CPU')
    print(f"사용할 디바이스: {cuda_model.device}")
    print(f'CUDA 로딩 시간: {time.time() - now_time}')

    now_time = time.time()
    Qsearcher = QdrantSearcher()
    print(f'Qdrant 로딩 시간: {time.time() - now_time}')

    now_time = time.time()
    youtubeCollector = YouTubeColletor()
    print(f'YouTubeColletor 로딩 시간: {time.time() - now_time}')

    now_time = time.time()
    sql_manager = Mysql_Manager()
    print(f'Mysql_Manager 로딩 시간: {time.time() - now_time}')

    # YouTube category id -> category label passed to the SQL manager.
    youtube_category_map = {25: "NEWS", 28: "IT", 15: "ANIMAL", 10: "MUSIC", 24: "ENTERTAINMENT", 0: "NEWMEDIA"}
    keywords_id_map = {}

    # Category ids processed on every run.
    auto_category = [28, 24, 25, 15, 0]

    # Category id -> collector method; each is called with a 'YYYY-MM-DD'
    # string and returns (search_sentence, video_list).
    category_collectors = {
        28: youtubeCollector.get_search_IT_video,
        24: youtubeCollector.get_search_ENTERTAINMENT_video,
        25: youtubeCollector.get_search_NEWS_video,
        15: youtubeCollector.get_search_ANIMAL_video,
        0: youtubeCollector.get_search_NEWMEDIA_video,
    }

    for fixed_datetime in namnams:
        for auto_c in auto_category:
            # The mode used to be read from input(); it is now fixed.
            # 0: search the input sentence only, 1: flexible search,
            # 2: YouTube popular videos.
            search_mode = 2

            if search_mode == 2:
                video_category = auto_c
                category_name = youtube_category_map[auto_c]
                print(f'Start :: {category_name}')

                collector = category_collectors.get(video_category)
                if collector is None:
                    search_sentence, video_list = "", []
                else:
                    # Only the date part of the timestamp is passed on.
                    search_sentence, video_list = collector(fixed_datetime[:10])

                # --- SQL part ---
                keywords_id_map = {}
                # Insert the keywords first; the leading num_main_keywords
                # keywords are flagged with 1, the rest with 0.
                for rank, kw in enumerate(search_sentence.split()):
                    keywords_id_map[kw] = sql_manager.insert_keyword(
                        kw,
                        1 if rank < num_main_keywords else 0,
                        category_name,
                        fixed_datetime,
                    )

                # Insert the source videos.
                for video in video_list:
                    video_id = sql_manager.insert_origin_data(video, "YOUTUBE", category_name, fixed_datetime)

                    # Fill the source-data <-> keyword join table.
                    for pkw in video['video_keywords']:
                        if pkw in keywords_id_map:
                            sql_manager.insert_trend_source(keywords_id_map[pkw], video_id, fixed_datetime)
            else:
                # Double-quoted so the prompt really shows '' as the exit
                # input (adjacent single-quoted literals dropped the quotes).
                search_sentence = input("Enter sentence(exit : ''): ")

            now_time = time.time()
            results = []

            # Compare by value: `is not ''` tests object identity, which is
            # implementation-dependent and raises a SyntaxWarning.
            if search_sentence != '':
                print(f'Start Searching {search_sentence}')
                if search_mode == 0:
                    search_vector = cuda_model.model.encode(search_sentence)

                    results = Qsearcher.search_items(search_vector, search_sentence)
                elif search_mode == 1 or search_mode == 2:
                    search_part_sentences = make_clu_sentence(search_sentence)
                    # A bare string here would be iterated character by
                    # character below; treat it as a single query instead.
                    if isinstance(search_part_sentences, str):
                        search_part_sentences = [search_part_sentences]

                    tmp_results = []
                    for search_part_sentence in tqdm(search_part_sentences, desc="Processing"):
                        search_vector = cuda_model.model.encode(search_part_sentence)

                        tmp_results.extend(Qsearcher.search_items(search_vector, search_part_sentence.split()))

                        # Keep the candidate pool bounded: once it exceeds 200
                        # entries, keep only the best-scoring ones.
                        if len(tmp_results) > 200:
                            tmp_results = sorted(tmp_results, key=lambda x: x['score'], reverse=True)
                            tmp_results = tmp_results[:num_books_to_collect]

                    tmp_results = sorted(tmp_results, key=lambda x: x['score'], reverse=True)

                    # Take the best-scoring hits, one per book name, until
                    # num_books_to_collect results have been gathered.
                    existing_books = set()
                    for candidate in tmp_results:
                        if len(results) >= num_books_to_collect:
                            break
                        if candidate['book_name'] not in existing_books:
                            existing_books.add(candidate['book_name'])
                            results.append(candidate)
            else:
                # NOTE(review): an empty sentence ends the whole category loop
                # for this timestamp (left over from the interactive
                # `while True` version); confirm whether `continue` is wanted.
                break

            print(f'[{search_sentence} :: {search_mode}] 검색 시간: {time.time() - now_time}')
            for result in results:
                print(result, end="\n----------------------------\n")
                # --- SQL part ---
                book_pk = sql_manager.insert_daily_recommend(result['book_id'], fixed_datetime)

                for kw in result['searched_keywords']:
                    sql_manager.insert_recommend_keyword(book_pk, keywords_id_map[kw], fixed_datetime)

    print(f'전체 걸린 시간 : {time.time() - process_start_time}')
1.03 KB
Binary file not shown.
5.43 KB
Binary file not shown.
3.03 KB
Binary file not shown.

Data/cuda_model.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
2+
from sentence_transformers import SentenceTransformer
3+
import torch
4+
5+
class CudaModel:
    """Load a SentenceTransformer model and move it to the chosen device.

    After construction, ``device`` holds the selected ``torch.device`` and
    ``model`` holds the loaded model.
    """

    def __init__(self, mode='GPU', model_name='sentence-transformers/distiluse-base-multilingual-cased-v2'):
        """Select the device, load the model and move it there.

        Args:
            mode: ``'CPU'`` (any letter case) forces the CPU. Any other value
                uses ``cuda:0`` when CUDA is available and the CPU otherwise.
            model_name: Model identifier handed to ``SentenceTransformer``.
        """
        # Case-insensitive so that 'cpu' does not silently select CUDA.
        if str(mode).upper() == 'CPU':
            self.device = torch.device("cpu")
        else:
            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = self.load_model(model_name)
        self.move_model_to_device()

    def load_model(self, model_name):
        """Return a ``SentenceTransformer`` built from ``model_name``."""
        return SentenceTransformer(model_name)

    def move_model_to_device(self):
        """Move ``self.model`` to ``self.device``."""
        self.model.to(self.device)

Data/google/api.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import MySQLdb
2+
from pytrends.request import TrendReq
3+
from datetime import datetime
4+
5+
# Create the Google Trends client (hl / tz are passed through to TrendReq).
pytrends = TrendReq(hl='ko-KR', tz=540)

# Today's date in local time, formatted as YYYY-MM-DD.
current_date = datetime.today().strftime('%Y-%m-%d')

# Fetch the currently trending searches for South Korea.
df = pytrends.trending_searches(pn='south_korea')

# Print the date followed by the fetched result.
print("현재 날짜:", current_date)
print(df)

Data/html_filter_test.py

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from bs4 import BeautifulSoup
2+
import json
3+
from tqdm import tqdm
4+
5+
def remove_html_tags(text):
    """Return the plain text of ``text`` with HTML markup removed.

    The input is parsed by BeautifulSoup using the ``html.parser`` backend and
    the result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()
10+
11+
12+
13+
14+
if __name__ == '__main__':
    # Path of the JSON file; it is read line by line, one JSON object per line.
    file_path = "C:/Users/SSAFY/jupyter/S10P31S203/book_yes24.json"

    with open(file_path, "r", encoding="utf-8") as json_file:
        # enumerate() drives the counter: the original `cnt` was never
        # incremented, so the preview limit below could never trigger.
        for cnt, line in enumerate(tqdm(json_file, desc="Processing")):
            # Preview only the first four records.
            if cnt > 3:
                break

            clean_text = remove_html_tags(json.loads(line)['contents'])

            print(clean_text)

Data/live_searching_api.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from qdrant_searching import QdrantSearcher
2+
from cuda_model import CudaModel
3+
from mysql_insert import Mysql_Manager
4+
import time
5+
6+
7+
class LiveBookSearcher:
    """Book lookups backed by the embedding model and the Qdrant searcher.

    The constructor creates the embedding model, the Qdrant searcher and the
    MySQL manager once, so each search method can reuse them.
    """

    def __init__(self):
        """Create the three backends and print how long each took to load."""
        now_time = time.time()
        self.cuda_model = CudaModel('CPU')
        print(f'CUDA 로딩 시간: {time.time() - now_time}')

        now_time = time.time()
        self.Qsearcher = QdrantSearcher()
        print(f'Qdrant 로딩 시간: {time.time() - now_time}')

        now_time = time.time()
        self.sql_manager = Mysql_Manager()
        print(f'Mysql_Manager 로딩 시간: {time.time() - now_time}')

    # For Fast API
    def live_keyword_searching(self, search_sentence):
        """Return the book ids matching a free-text search sentence.

        Args:
            search_sentence: Text to encode and search for.

        Returns:
            A list with the ``'book_id'`` of every search hit, or ``None``
            when ``search_sentence`` is empty or ``None``.
        """
        # Truthiness check instead of `is not ''`: identity comparison with a
        # literal is unreliable and would let None through to the encoder.
        if not search_sentence:
            return None

        print(f'Start Searching {search_sentence}')

        search_vector = self.cuda_model.model.encode(search_sentence)

        # The trailing 10 is presumably the result limit -- TODO confirm
        # against QdrantSearcher.search_items.
        search_results = self.Qsearcher.search_items(search_vector, search_sentence, 10)

        return [search_result['book_id'] for search_result in search_results]

    # For Fast API
    def memorial_book_searching(self, memorial_book):  # memorial_book :: product_id
        """Return books related to ``memorial_book``, or ``None`` if it is None.

        The lookup is delegated to ``QdrantSearcher.find_memorial_book``.
        """
        if memorial_book is None:
            return None

        # Related-book recommendation: currently searches across every book
        # in the DB; planned to be narrowed later to the books chosen by
        # people who selected the same question.
        results = self.Qsearcher.find_memorial_book(memorial_book)

        # Placeholder: the results will be uploaded to the DB through SQL
        # here; on hold because the table is not finalized.
        # self.sql_manager.{TBD}

        return results

    def memorial_book_searching_real_service(self, memorial_book, question_id):  # memorial_book :: product_id
        """Return books related to ``memorial_book`` for a given question.

        The lookup is delegated to
        ``QdrantSearcher.find_memorial_book_real_service``. Returns ``None``
        when ``memorial_book`` is None.
        """
        if memorial_book is None:
            return None

        # Related-book recommendation: currently searches across every book
        # in the DB; planned to be narrowed later to the books chosen by
        # people who selected the same question.
        results = self.Qsearcher.find_memorial_book_real_service(memorial_book, question_id)

        # Placeholder: the results will be uploaded to the DB through SQL
        # here; on hold because the table is not finalized.
        # self.sql_manager.{TBD}

        return results
70+

0 commit comments

Comments
 (0)