65 lines
2.6 KiB
Python
65 lines
2.6 KiB
Python
import numpy
|
|
import pdfplumber
|
|
import pandas as pd
|
|
|
|
|
|
def extract_table_from_pdf(pdf_path, start_page, skip_header):
    """Collect table rows from a PDF into a single DataFrame.

    Every page from ``start_page`` to the end of the document is scanned with
    ``page.extract_table()`` and the rows found are concatenated in page order.

    Args:
        pdf_path: Location of the PDF, handed straight to ``pdfplumber.open``.
        start_page: 1-based number of the first page to read.
        skip_header: When true, the first row of each page's table is dropped
            before the rows are collected.

    Returns:
        A ``pandas.DataFrame`` built from the collected rows, or ``None``
        (after printing a notice) when no rows were collected.

    Raises:
        ValueError: If ``start_page`` is smaller than 1.
    """
    # Page numbers are 1-based. Without this guard a value below 1 turns into
    # a negative index, which silently wraps around to the end of the document.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[start_page - 1:]:
            table = page.extract_table()
            # Pages without a detectable table contribute nothing.
            if not table:
                continue
            rows.extend(table[1:] if skip_header else table)

    if not rows:
        print("No table found in the specified range.")
        return None
    return pd.DataFrame(rows)
|
|
|
|
|
|
def create_gpt_training_data(hscode_bps_pdf=None, mensa_import_excel=None,
                             mensa_custom_correction_excel=None,
                             hscode_bps_start_page=864):
    """Build HS-code training records from up to three optional sources.

    Each source is skipped when its argument is ``None``. Every record is
    produced by ``construct_gpt_hs_training``.

    Args:
        hscode_bps_pdf: PDF with HS-code tables (INSW PDF files). Rows are
            read via ``extract_table_from_pdf`` with the header row skipped;
            position 2 of a row is used as the product description and
            position 1 as the HS code.
        mensa_import_excel: Excel file with historical import data; position 2
            of a row is used as the product description and position 1 as the
            HS code.
        mensa_custom_correction_excel: Excel file with customs corrections.
            Only columns B, K and L are loaded; position 1 is used as the
            product description and position 2 as the HS code. Rows whose HS
            code is missing or empty are skipped.
        hscode_bps_start_page: 1-based page of ``hscode_bps_pdf`` where
            extraction starts. Defaults to 864, where the newest data starts.

    Returns:
        A list of training records, in source order; empty when no source is
        given or no usable rows are found.
    """
    training_data = []

    # Training data from the INSW PDF files.
    if hscode_bps_pdf is not None:
        hs_table = extract_table_from_pdf(
            pdf_path=hscode_bps_pdf,
            start_page=hscode_bps_start_page,
            skip_header=True,
        )
        # extract_table_from_pdf returns None when it finds no table; treat
        # that as "no rows" instead of failing on None.iterrows().
        if hs_table is not None:
            for _, row in hs_table.iterrows():
                training_data.append(
                    construct_gpt_hs_training(product_desc=row.iloc[2], hs_code=row.iloc[1])
                )

    # Training data from mensa historical import data.
    if mensa_import_excel is not None:
        import_hist = pd.read_excel(mensa_import_excel)
        for _, row in import_hist.iterrows():
            # .iloc keeps the lookup positional even though the columns read
            # from Excel carry header labels rather than integers.
            training_data.append(
                construct_gpt_hs_training(product_desc=row.iloc[2], hs_code=row.iloc[1])
            )

    # Training data from mensa customs-correction import data.
    if mensa_custom_correction_excel is not None:
        custom_correction = pd.read_excel(mensa_custom_correction_excel, usecols="B,K,L")
        for _, row in custom_correction.iterrows():
            hs_code = row.iloc[2]
            # Skip rows without an HS code. pd.isna is checked first because it
            # accepts any scalar (numpy.isnan raises TypeError on strings).
            if pd.isna(hs_code) or not hs_code:
                continue
            training_data.append(
                construct_gpt_hs_training(product_desc=row.iloc[1], hs_code=hs_code)
            )

    return training_data
|
|
|
|
|
|
def construct_gpt_hs_training(product_desc, hs_code, use_gemini_format=True):
    """Build one training record pairing a product description with its HS code.

    Args:
        product_desc: Product description inserted into the prompt text.
        hs_code: HS code the model is expected to answer with.
        use_gemini_format: When true (the default, matching the previous
            always-on behaviour) the record is produced by
            ``construct_gemini_training_data``. When false, a chat-style
            record with system/user/assistant messages is returned instead.

    Returns:
        A dict holding the training record in the selected format.
    """
    if use_gemini_format:
        return construct_gemini_training_data(product_desc, hs_code)

    import json

    # Serialise with json.dumps so quotes or backslashes inside the
    # description cannot break the JSON the assistant is trained to emit.
    assistant_reply = json.dumps(
        {"hs_code": str(hs_code), "uraian": str(product_desc)},
        ensure_ascii=False,
    )
    return {
        "messages": [
            {"role": "system", "content": "You are mensa-hsai that able to determine HS Code"
                                          " from a product including product in Bahasa Indonesia"},
            {"role": "user", "content": f'"What is the hs code for {product_desc}"'},
            {"role": "assistant", "content": assistant_reply},
        ]
    }
|
|
|
|
|
|
def construct_gemini_training_data(product_desc, hs_code):
    """Return a prompt/answer pair for one product.

    The prompt asks for the HS code of ``product_desc`` and is stored under
    ``"text_input"``; the text form of ``hs_code`` is stored under ``"output"``.
    """
    prompt = f'What is the hs code for {product_desc}'
    answer = f'{hs_code}'
    return dict(text_input=prompt, output=answer)
|