|
| 1 | +from qdrant_searching import QdrantSearcher |
| 2 | +from cuda_model import CudaModel |
| 3 | +from youtube.youtube_collector import YouTubeColletor |
| 4 | +from mysql_insert import Mysql_Manager |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | +from itertools import combinations |
| 8 | +import time |
| 9 | +from tqdm import tqdm |
| 10 | + |
| 11 | + |
# Console display settings for pandas: print at most 10 columns and 10 rows,
# and remove the limits on total output width and per-column text width.
pd.options.display.max_columns = 10
pd.options.display.max_rows = 10
pd.options.display.width = None
pd.options.display.max_colwidth = None

# Upper bound on the number of distinct books kept per search.
num_books_to_collect = 30
# The first `num_main_keywords` keywords of a search sentence are flagged 1
# when inserted; the remaining ones are flagged 0.
num_main_keywords = 4
# Timestamps to process, one full run per entry. Earlier runs used
# '2024-05-09 02:00:00' through '2024-05-14 02:00:00'.
namnams = ['2024-05-16 02:00:00']
| 21 | + |
def make_clu_sentence(sent, k=4):
    """Expand a whitespace-separated sentence into k-word sub-sentences.

    Args:
        sent: Sentence whose words are separated by whitespace.
        k: Number of words in each generated sub-sentence.

    Returns:
        A list of strings, always:
        * ``[]`` when ``sent`` contains no words,
        * ``[sent]`` when it has ``k`` words or fewer,
        * otherwise one space-joined string per k-word combination, in the
          order produced by ``itertools.combinations`` (original word order
          is preserved inside each combination).
    """
    words = sent.split()

    # Nothing to search for.
    if not words:
        return []

    # With k words or fewer the sentence itself is the only query. It is
    # wrapped in a list so that callers iterating the result get one
    # sentence rather than the individual characters of a string.
    if len(words) <= k:
        return [sent]

    return [' '.join(combo) for combo in combinations(words, k)]
| 32 | + |
| 33 | + |
if __name__ == "__main__":
    # Batch driver: for every timestamp in `namnams` and every category in
    # `auto_category`, collect YouTube keywords/videos, store them through
    # `sql_manager`, search Qdrant for matching books and store the results.
    process_start_time = time.time()

    # Each dependency is constructed once; its start-up time is printed.
    now_time = time.time()
    cuda_model = CudaModel('CPU')
    print(f"사용할 디바이스: {cuda_model.device}")
    print(f'CUDA 로딩 시간: {time.time() - now_time}')

    now_time = time.time()
    Qsearcher = QdrantSearcher()
    print(f'Qdrant 로딩 시간: {time.time() - now_time}')

    now_time = time.time()
    youtubeCollector = YouTubeColletor()
    print(f'YouTubeColletor 로딩 시간: {time.time() - now_time}')

    now_time = time.time()
    sql_manager = Mysql_Manager()
    print(f'Mysql_Manager 로딩 시간: {time.time() - now_time}')

    # Category id -> label passed to the SQL layer.
    youtube_category_map = {25: "NEWS", 28: "IT", 15: "ANIMAL", 10: "MUSIC", 24: "ENTERTAINMENT", 0: "NEWMEDIA"}
    keywords_id_map = {}

    # Categories processed automatically, in this order.
    auto_category = [28, 24, 25, 15, 0]

    # Category id -> collector method. Categories without an entry produce
    # an empty sentence and an empty video list.
    video_collectors = {
        28: youtubeCollector.get_search_IT_video,
        24: youtubeCollector.get_search_ENTERTAINMENT_video,
        25: youtubeCollector.get_search_NEWS_video,
        15: youtubeCollector.get_search_ANIMAL_video,
        0: youtubeCollector.get_search_NEWMEDIA_video,
    }

    # 0: search the typed sentence as-is, 1: flexible search on a typed
    # sentence, 2: sentence built from collected YouTube videos (automatic).
    search_mode = 2

    for fixed_datetime in namnams:
        # Collectors receive only the first 10 characters of the timestamp
        # (the 'YYYY-MM-DD' part of the values listed in `namnams`).
        collect_date = fixed_datetime[:10]

        for auto_c in auto_category:
            if search_mode == 2:
                video_category = auto_c
                category_name = youtube_category_map[video_category]
                print(f'Start :: {category_name}')

                collect = video_collectors.get(video_category)
                if collect is None:
                    search_sentence, video_list = "", []
                else:
                    search_sentence, video_list = collect(collect_date)

                # --- SQL part ---
                # Insert the keywords first and remember the id returned for
                # each; the first `num_main_keywords` are flagged 1, the
                # rest 0.
                keywords_id_map = {}
                for rank, kw in enumerate(search_sentence.split()):
                    main_flag = 1 if rank < num_main_keywords else 0
                    keywords_id_map[kw] = sql_manager.insert_keyword(kw, main_flag, category_name, fixed_datetime)

                # Insert the source videos.
                for video in video_list:
                    video_id = sql_manager.insert_origin_data(video, "YOUTUBE", category_name, fixed_datetime)

                    # Fill the source-data / keyword join table for the
                    # video keywords that are part of the search sentence.
                    for pkw in video['video_keywords']:
                        if pkw in keywords_id_map:
                            sql_manager.insert_trend_source(keywords_id_map[pkw], video_id, fixed_datetime)
            else:
                # Double quotes keep the '' visible in the prompt; adjacent
                # single-quoted literals would concatenate and drop it.
                search_sentence = input("Enter sentence(exit : ''): ")

            now_time = time.time()
            results = []

            # Compare by value/truthiness: `is not ''` tests object identity,
            # which is implementation-dependent for strings.
            if not search_sentence:
                if search_mode == 2:
                    # No keywords for this category; the remaining
                    # categories must still be processed.
                    continue
                # Interactive modes: an empty sentence means "exit".
                break

            print(f'Start Searching {search_sentence}')
            if search_mode == 0:
                search_vector = cuda_model.model.encode(search_sentence)

                results = Qsearcher.search_items(search_vector, search_sentence)
            elif search_mode in (1, 2):
                search_part_sentences = make_clu_sentence(search_sentence)
                # A bare string would be iterated character by character
                # below, so treat it as a single query.
                if isinstance(search_part_sentences, str):
                    search_part_sentences = [search_part_sentences]

                tmp_results = []
                for search_part_sentence in tqdm(search_part_sentences, desc="Processing"):
                    search_vector = cuda_model.model.encode(search_part_sentence)

                    tmp_results.extend(Qsearcher.search_items(search_vector, search_part_sentence.split()))

                    # Bound the candidate list: once it exceeds 200 entries
                    # keep only the best-scoring `num_books_to_collect`.
                    if len(tmp_results) > 200:
                        tmp_results = sorted(tmp_results, key=lambda x: x['score'], reverse=True)
                        tmp_results = tmp_results[:num_books_to_collect]

                tmp_results = sorted(tmp_results, key=lambda x: x['score'], reverse=True)

                # Walk the candidates best-first, keeping the first hit per
                # book name until enough distinct books are collected.
                existing_books = set()
                for candidate in tmp_results:
                    if len(results) >= num_books_to_collect:
                        break
                    if candidate['book_name'] not in existing_books:
                        existing_books.add(candidate['book_name'])
                        results.append(candidate)

            print(f'[{search_sentence} :: {search_mode}] 검색 시간: {time.time() - now_time}')
            for result in results:
                print(result, end="\n----------------------------\n")
                # --- SQL part ---
                book_pk = sql_manager.insert_daily_recommend(result['book_id'], fixed_datetime)

                for kw in result['searched_keywords']:
                    # Only keywords inserted above have an id; skip the
                    # others instead of raising KeyError (same guard as for
                    # the video keywords).
                    if kw in keywords_id_map:
                        sql_manager.insert_recommend_keyword(book_pk, keywords_id_map[kw], fixed_datetime)

    print(f'전체 걸린 시간 : {time.time() - process_start_time}')
0 commit comments