initial commit
This commit is contained in:
64
insw_scrapper.py
Normal file
64
insw_scrapper.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import numpy
|
||||
import pdfplumber
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def extract_table_from_pdf(pdf_path, start_page, skip_header):
    """Collect table rows from a PDF into a single DataFrame.

    Every page from ``start_page`` through the end of the document is passed
    to ``page.extract_table()`` and the rows it returns are concatenated in
    page order.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.
        start_page: 1-based number of the first page to read.
        skip_header: When true, the first row of each page's table is dropped
            before the rows are collected.

    Returns:
        A ``pandas.DataFrame`` built from all collected rows, or ``None``
        (after printing a notice) when no page yielded any rows.

    Raises:
        ValueError: If ``start_page`` is less than 1.
    """
    if start_page < 1:
        # A zero or negative start page would become a negative index and
        # silently read pages counted from the end of the document.
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[start_page - 1:]:
            table = page.extract_table()
            if not table:
                # Nothing extracted from this page; move on to the next one.
                continue
            rows.extend(table[1:] if skip_header else table)

    if not rows:
        print("No table found in the specified range.")
        return None
    return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def create_gpt_training_data(hscode_bps_pdf=None, mensa_import_excel=None,
                             mensa_custom_correction_excel=None, hscode_start_page=864):
    """Build a list of HS-code training examples from up to three sources.

    Each source is optional; sources left as ``None`` are skipped. Every
    example is produced by ``construct_gpt_hs_training``.

    Args:
        hscode_bps_pdf: Path of the INSW PDF whose tables are read with
            ``extract_table_from_pdf`` (header row skipped on each page).
        mensa_import_excel: Path of an Excel file with historical import
            data, read with ``pandas.read_excel``.
        mensa_custom_correction_excel: Path of an Excel file with customs
            correction data; only sheet columns B, K and L are read.
        hscode_start_page: 1-based first PDF page to read. Defaults to 864,
            where the newest data starts according to the original author.

    Returns:
        A list of training examples, empty when no source is given or no
        rows are found.
    """
    training_data = []

    # Training data from the INSW PDF tables.
    if hscode_bps_pdf is not None:
        data = extract_table_from_pdf(pdf_path=hscode_bps_pdf, start_page=hscode_start_page, skip_header=True)
        # extract_table_from_pdf returns None when no table was found.
        if data is not None:
            training_data.extend(
                construct_gpt_hs_training(product_desc=row.iloc[2], hs_code=row.iloc[1])
                for _, row in data.iterrows()
            )

    # Training data from mensa historical import data.
    if mensa_import_excel is not None:
        import_hist = pd.read_excel(mensa_import_excel)
        # Columns are addressed by position; .iloc avoids the deprecated
        # positional fallback of integer keys on a label-indexed Series.
        training_data.extend(
            construct_gpt_hs_training(product_desc=row.iloc[2], hs_code=row.iloc[1])
            for _, row in import_hist.iterrows()
        )

    # Training data from mensa customs correction import data.
    if mensa_custom_correction_excel is not None:
        custom_correction = pd.read_excel(mensa_custom_correction_excel, usecols="B,K,L")
        for _, row in custom_correction.iterrows():
            product_desc = row.iloc[1]
            hs_code = row.iloc[2]
            # Skip rows without an HS code. pd.isna accepts strings as well
            # as floats, unlike numpy.isnan which raises on text cells.
            if pd.isna(hs_code) or not hs_code:
                continue
            training_data.append(construct_gpt_hs_training(product_desc=product_desc, hs_code=hs_code))

    return training_data
|
||||
|
||||
|
||||
def construct_gpt_hs_training(product_desc, hs_code, *, output_format="gemini"):
    """Build one training example for a product description / HS code pair.

    Args:
        product_desc: Product description inserted into the prompt.
        hs_code: HS code used as the expected answer.
        output_format: ``"gemini"`` (default) delegates to
            ``construct_gemini_training_data``; ``"openai"`` returns a
            chat-style ``{"messages": [...]}`` dict with system, user and
            assistant entries.

    Returns:
        A dict in the requested format.

    Raises:
        ValueError: If ``output_format`` is not a supported value.
    """
    # The default keeps the previous behaviour, where a constant-true
    # condition always selected the Gemini format.
    if output_format == "gemini":
        return construct_gemini_training_data(product_desc, hs_code)
    if output_format != "openai":
        raise ValueError(f"unsupported output_format: {output_format!r}")

    # NOTE(review): the assistant content is hand-assembled JSON text; quotes
    # inside product_desc or hs_code are not escaped.
    return {
        "messages": [
            {"role": "system", "content": "You are mensa-hsai that able to determine HS Code"
                                          " from a product including product in Bahasa Indonesia"},
            {"role": "user", "content": f'"What is the hs code for {product_desc}"'},
            {"role": "assistant", "content": f'{{"hs_code" : "{hs_code}", "uraian": "{product_desc}" }}'},
        ]
    }
|
||||
|
||||
|
||||
def construct_gemini_training_data(product_desc, hs_code):
    """Return one Gemini tuning record for a product / HS code pair.

    The record has two keys: ``text_input`` holds the question about the
    product description, and ``output`` holds the HS code formatted as text.
    """
    question = f"What is the hs code for {product_desc}"
    answer = f"{hs_code}"
    record = {}
    record["text_input"] = question
    record["output"] = answer
    return record
|
||||
Reference in New Issue
Block a user