data-processing-scripts/parseCorpListJson.py at main · InvestiMate-AI/data-processing-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# from dataproc import Dart, OPENAICommunicator
# import time
# import json

# List of company names.
# Entries that are commented out below are disabled and not part of the list.
corp_names = [
    "삼성전자",
    "SK하이닉스",
    "LG에너지솔루션",
    "현대차",
    "삼성바이오로직스",
    "기아",
    "셀트리온",
    "KB금융",
    "POSCO홀딩스",
    "NAVER",
    "삼성물산",
    "신한지주",
    "LG화학",
    "삼성SDI",
    "현대모비스",
    "포스코퓨처엠",
    "삼성화재",
    "LG전자",
    "카카오",
    "하나금융지주",
    "삼성생명",
    "한미반도체",
    # "삼성전자우",
    # "메리츠금융지주", "HMM", "HD현대중공업", "크래프톤", "SK스퀘어", "두산에너빌리티", "LG", "한화에어로스페이스",
    # "한국전력", "삼성전기", "포스코인터내셔널", "KT&G", "삼성에스디에스", "SK", "HD한국조선해양",
    # "기업은행", "HD현대일렉트릭", "SK이노베이션", "SK텔레콤", "우리금융지주", "고려아연", "아모레퍼시픽",
    # "카카오뱅크", "KT", "한화오션", "대한항공", "하이브", "삼성중공업", "현대글로비스", "DB손해보험",
    # "S-Oil", "LS ELECTRIC", "현대차2우B", "KODEX 200", "유한양행", "LG이노텍", "SKC", "에코프로머티",
    # "SK바이오팜", "HD현대", "포스코DX", "CJ제일제당", "HD현대마린솔루션", "LG디스플레이", "한국타이어앤테크놀로지",
    # "LG생활건강", "맥쿼리인프라", "두산로보틱스", "한국항공우주", "두산밥캣", "삼양식품", "금양", "엘앤에프",
    # "롯데케미칼", "LIG넥스원", "한국가스공사", "한화솔루션", "코웨이", "삼성E&A", "코스모신소재",
    # "LS", "넷마블", "현대오토에버", "현대로템", "삼성카드", "GS", "미래에셋증권", "현대차우", "한진칼",
    # "LG유플러스", "NH투자증권", "금호석유", "SK바이오사이언스", "엔씨소프트"
]

# # 사업 연도 리스트
# bsns_years = ['2023', '2024']

# # 보고서 유형 리스트
# reprt_types = ['사업보고서', '반기보고서', '1분기보고서', '3분기보고서']

# # Dart 인스턴스 생성
# d = Dart()
# communicator = OPENAICommunicator()

# # 각 기업에 대해 각 연도와 보고서 유형에 대해 보고서 다운로드 및 XML 파싱
# for corp_name in corp_names:
#     for bsns_year in bsns_years:
#         for reprt_type in reprt_types:
#             try:
#                 print(f"Downloading report for {corp_name}, {bsns_year}, {reprt_type}")
#                 d.download_report(corp_name, bsns_year, reprt_type)
#                 print(f"Parsing XML for {corp_name}, {bsns_year}, {reprt_type}")
#                 d.parse_xml(corp_name, bsns_year, reprt_type)
#                 time.sleep(1)  # 서버에 부담을 주지 않기 위해 1초 대기
#             except KeyError as e:
#                 print(f"Error downloading report for {corp_name}, {bsns_year}, {reprt_type}: {e}")
#                 continue


# results = []
# for corp_name in corp_names:
#     for bsns_year in bsns_years:
#         for reprt_type in reprt_types:
#             try:
#                 # 파일 업로드 및 어시스턴트 생성
#                 vector_store_id = communicator.create_vector_store(corp_name, bsns_year, reprt_type)
#                 file_ids = communicator.upload_files(corp_name, bsns_year, reprt_type, vector_store_id)
#                 assistant_id = communicator.create_assistant(corp_name, bsns_year, reprt_type, vector_store_id)

#                 # 결과 저장
#                 result = {
#                     "corp_name": corp_name,
#                     "report_type": reprt_type,
#                     "year": bsns_year,
#                     "assistant_id": assistant_id
#                 }
#                 results.append(result)
#                 print(result)

#                 # 서버에 부담을 주지 않기 위해 1초 대기
#                 time.sleep(1)

#             except Exception as e:
#                 print(f"Error processing {corp_name}, {bsns_year}, {reprt_type}: {e}")
#                 continue

# # 결과를 JSON 파일로 저장
# with open('results.json', 'w', encoding='utf-8') as f:
#     json.dump(results, f, ensure_ascii=False, indent=4)

# print("Processing completed.")

import json

# Path of the JSON file that holds the entries to convert into SQL.
RESULTS_PATH = 'C:/Users/kangs/Projects/Capstone/results.json'

# Fixed head of the generated statement; the value tuples are appended to it.
INSERT_PREFIX = "INSERT INTO report (report_year, assistant_id, report_company, report_type) VALUES "


def sql_quote(value):
    """Return *value* rendered as a single-quoted SQL string literal.

    Embedded single quotes are doubled (standard SQL escaping) so that a value
    containing ``'`` cannot terminate the literal early and corrupt the
    generated statement.
    """
    return "'" + str(value).replace("'", "''") + "'"


def build_insert_query(entries):
    """Build one multi-row ``INSERT INTO report`` statement from *entries*.

    Args:
        entries: Iterable of mappings, each providing the keys ``'year'``,
            ``'assistant_id'``, ``'corp_name'`` and ``'report_type'``.

    Returns:
        The complete SQL statement terminated by ``;``, or an empty string
        when *entries* is empty (``INSERT ... VALUES ;`` is not valid SQL).

    Raises:
        KeyError: If an entry lacks one of the required keys.
        ValueError, TypeError: If an entry's ``'year'`` is not an integer
            value; it is emitted unquoted, so it must be numeric.
    """
    rows = [
        "({}, {}, {}, {})".format(
            # The year goes into the statement unquoted, so force it to an int.
            int(entry['year']),
            sql_quote(entry['assistant_id']),
            sql_quote(entry['corp_name']),
            sql_quote(entry['report_type']),
        )
        for entry in entries
    ]
    if not rows:
        return ""
    return INSERT_PREFIX + ", ".join(rows) + ";"


def main():
    """Load the entries from RESULTS_PATH and print the INSERT statement."""
    with open(RESULTS_PATH, 'r', encoding='utf-8') as file:
        data = json.load(file)

    sql_query = build_insert_query(data)
    # Nothing is printed for an empty file rather than emitting invalid SQL.
    if sql_query:
        print(sql_query)


# Only touch the file system when run as a script, not when imported.
if __name__ == "__main__":
    main()