Files
mensa-gpt-hscode/insw_scrapper.py
2025-02-19 11:17:17 +08:00

65 lines
2.6 KiB
Python

import json

import numpy
import pandas as pd
import pdfplumber
def extract_table_from_pdf(pdf_path, start_page, skip_header):
    """Collect table rows from a PDF into a single DataFrame.

    Every page from ``start_page`` through the end of the document is passed
    to ``page.extract_table()`` and the rows it returns are stacked in page
    order.

    Args:
        pdf_path: Value handed to ``pdfplumber.open``.
        start_page: 1-based number of the first page to scan. Values below 1
            are treated as 1, so a zero or negative value cannot wrap around
            and address pages counted from the end of the document.
        skip_header: When truthy, drop the first row of the table found on
            each page before collecting it.

    Returns:
        A ``pandas.DataFrame`` built from the collected rows, or ``None``
        (after printing a notice) when no rows were collected.
    """
    # Clamp to the first page: a negative index would silently select pages
    # from the end of the document and read them out of order.
    first_index = max(start_page - 1, 0)

    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[first_index:]:
            page_rows = page.extract_table()
            if not page_rows:
                # Nothing usable was extracted from this page.
                continue
            rows.extend(page_rows[1:] if skip_header else page_rows)

    if not rows:
        print("No table found in the specified range.")
        return None
    return pd.DataFrame(rows)
def create_gpt_training_data(hscode_bps_pdf=None, mensa_import_excel=None, mensa_custom_correction_excel=None,
                             hscode_bps_start_page=864):
    """Build a list of HS-code training records from the supplied sources.

    Each source is optional; sources left as ``None`` are skipped. Every row
    is converted with ``construct_gpt_hs_training``. Columns are addressed by
    position, not by header label.

    Args:
        hscode_bps_pdf: INSW PDF to read tables from. In the extracted table,
            column 1 is used as the HS code and column 2 as the product
            description.
        mensa_import_excel: Excel file with historical import data; column 1
            is used as the HS code and column 2 as the product description.
        mensa_custom_correction_excel: Excel file with customs corrections.
            Only sheet columns B, K and L are loaded; K is used as the
            product description and L as the HS code. Rows whose HS code is
            missing or empty are skipped.
        hscode_bps_start_page: 1-based page of ``hscode_bps_pdf`` at which
            extraction starts. Defaults to 864, where the newest data begins.

    Returns:
        A list of training records, in source order (PDF, import history,
        customs corrections).
    """
    training_data = []

    # Source 1: INSW PDF tables.
    if hscode_bps_pdf is not None:
        bps_table = extract_table_from_pdf(
            pdf_path=hscode_bps_pdf, start_page=hscode_bps_start_page, skip_header=True
        )
        # extract_table_from_pdf returns None when it found no table rows.
        if bps_table is not None:
            for _, row in bps_table.iterrows():
                training_data.append(
                    construct_gpt_hs_training(product_desc=row.iloc[2], hs_code=row.iloc[1])
                )

    # Source 2: mensa historical import data.
    if mensa_import_excel is not None:
        import_hist = pd.read_excel(mensa_import_excel)
        for _, row in import_hist.iterrows():
            # .iloc keeps the lookup positional even though the columns carry
            # header labels; integer keys via row[...] are deprecated there.
            training_data.append(
                construct_gpt_hs_training(product_desc=row.iloc[2], hs_code=row.iloc[1])
            )

    # Source 3: mensa customs-correction import data.
    if mensa_custom_correction_excel is not None:
        custom_correction = pd.read_excel(mensa_custom_correction_excel, usecols="B,K,L")
        for _, row in custom_correction.iterrows():
            hs_code = row.iloc[2]
            # Skip rows without an HS code. pd.isna also accepts text cells,
            # whereas numpy.isnan raises TypeError on a string HS code.
            if pd.isna(hs_code) or not hs_code:
                continue
            training_data.append(
                construct_gpt_hs_training(product_desc=row.iloc[1], hs_code=hs_code)
            )

    return training_data
def construct_gpt_hs_training(product_desc, hs_code, use_gemini_format=True):
    """Build one training record for a product description / HS code pair.

    Args:
        product_desc: Product description inserted into the prompt.
        hs_code: HS code used as the expected answer.
        use_gemini_format: When true (the default, which matches the
            previously hard-coded behaviour) the record is produced by
            ``construct_gemini_training_data``. When false, a chat-style
            ``{"messages": [...]}`` record with system, user and assistant
            entries is returned instead.

    Returns:
        A dict holding a single training example in the selected format.
    """
    if use_gemini_format:
        return construct_gemini_training_data(product_desc, hs_code)

    # JSON-escape the interpolated values so that quotes, backslashes or
    # control characters in a description cannot break the assistant answer.
    # Plain values render exactly as a bare f-string interpolation would.
    hs_code_json = json.dumps(str(hs_code), ensure_ascii=False)
    product_desc_json = json.dumps(str(product_desc), ensure_ascii=False)

    return {
        "messages": [
            {"role": "system", "content": "You are mensa-hsai that able to determine HS Code"
                                          " from a product including product in Bahasa Indonesia"},
            {"role": "user", "content": f'"What is the hs code for {product_desc}"'},
            {"role": "assistant",
             "content": f'{{"hs_code" : {hs_code_json}, "uraian": {product_desc_json} }}'},
        ]
    }
def construct_gemini_training_data(product_desc, hs_code):
    """Return a ``text_input`` / ``output`` training record.

    ``text_input`` is the question about ``product_desc`` and ``output`` is
    ``hs_code`` rendered as text.
    """
    question = f"What is the hs code for {product_desc}"
    answer = format(hs_code)
    return {
        "text_input": question,
        "output": answer,
    }