"""Build HS-code fine-tuning examples from a PDF table and Excel sheets."""

from typing import Optional

import numpy  # noqa: F401  (no longer used below; kept in case other code relies on it)
import pandas as pd

try:
    import pdfplumber
except ImportError:
    # Every data source is optional, so the PDF backend is only required
    # when extract_table_from_pdf() is actually called.
    pdfplumber = None


def extract_table_from_pdf(pdf_path, start_page, skip_header) -> Optional[pd.DataFrame]:
    """Collect the table rows of every page from ``start_page`` to the end.

    Args:
        pdf_path: Path (or file object) handed to ``pdfplumber.open``.
        start_page: 1-based number of the first page to read. Values below 1
            are treated as 1 (previously they produced a negative index and
            read the last page first).
        skip_header: When true, the first row of each page's table is dropped.

    Returns:
        A DataFrame with one row per extracted table row (integer column
        labels), or ``None`` when no page yielded any row.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    if pdfplumber is None:
        raise ImportError("pdfplumber is required to extract tables from PDF files")

    first_index = max(start_page - 1, 0)
    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[first_index:]:
            page_rows = page.extract_table()
            if not page_rows:
                continue
            # Slice instead of `del` so the list returned by pdfplumber is
            # not mutated.
            rows.extend(page_rows[1:] if skip_header else page_rows)

    if not rows:
        print("No table found in the specified range.")
        return None
    return pd.DataFrame(rows)


def _is_missing(value) -> bool:
    """Return True for NaN/None/NA cells and for otherwise falsy cells."""
    # pd.isna must run first: it accepts strings (numpy.isnan raises
    # TypeError on them) and `not pd.NA` would itself raise.
    return bool(pd.isna(value)) or not value


def _rows_to_training(frame, desc_pos, hs_pos, skip_missing_hs=False):
    """Turn each row of ``frame`` into one training example.

    Columns are addressed by position (``.iloc``) so the lookup does not
    depend on whether the column labels are integers or header strings.
    """
    examples = []
    for _, row in frame.iterrows():
        hs_code = row.iloc[hs_pos]
        if skip_missing_hs and _is_missing(hs_code):
            continue
        examples.append(
            construct_gpt_hs_training(product_desc=row.iloc[desc_pos], hs_code=hs_code)
        )
    return examples


def create_gpt_training_data(hscode_bps_pdf=None, mensa_import_excel=None,
                             mensa_custom_correction_excel=None, pdf_start_page=864):
    """Assemble training examples from up to three optional sources.

    Args:
        hscode_bps_pdf: HS-code PDF; column 2 is used as the product
            description and column 1 as the HS code.
        mensa_import_excel: Historical import workbook; third column is the
            product description, second column the HS code.
        mensa_custom_correction_excel: Customs-correction workbook. Only
            Excel columns B, K and L are loaded; K is the product description
            and L the HS code. Rows with an empty HS code are skipped.
        pdf_start_page: 1-based first PDF page to read. Defaults to 864,
            where, per the original note in this file, the newest data starts.

    Returns:
        A list of training examples as built by ``construct_gpt_hs_training``.
        Sources that are ``None`` (or a PDF without any table) contribute
        nothing.
    """
    training_data = []

    if hscode_bps_pdf is not None:
        pdf_table = extract_table_from_pdf(
            pdf_path=hscode_bps_pdf, start_page=pdf_start_page, skip_header=True
        )
        # extract_table_from_pdf returns None when nothing was found;
        # iterating it unguarded used to raise AttributeError.
        if pdf_table is not None:
            training_data.extend(_rows_to_training(pdf_table, desc_pos=2, hs_pos=1))

    if mensa_import_excel is not None:
        import_hist = pd.read_excel(mensa_import_excel)
        training_data.extend(_rows_to_training(import_hist, desc_pos=2, hs_pos=1))

    if mensa_custom_correction_excel is not None:
        custom_correction = pd.read_excel(mensa_custom_correction_excel, usecols="B,K,L")
        training_data.extend(
            _rows_to_training(custom_correction, desc_pos=1, hs_pos=2, skip_missing_hs=True)
        )

    return training_data


def construct_gpt_hs_training(product_desc, hs_code, *, use_gemini_format=True):
    """Build one training example for a product / HS-code pair.

    Args:
        product_desc: Product description inserted into the prompt.
        hs_code: HS code used as the expected answer.
        use_gemini_format: When true (the default, matching the previous
            hard-wired behaviour) return the ``text_input``/``output`` record
            from ``construct_gemini_training_data``. When false return the
            chat-style ``messages`` record that used to be unreachable.

    Returns:
        A dict in the selected format.
    """
    if use_gemini_format:
        return construct_gemini_training_data(product_desc, hs_code)

    return {
        "messages": [
            {"role": "system",
             "content": "You are mensa-hsai that able to determine HS Code"
                        " from a product including product in Bahasa Indonesia"},
            {"role": "user",
             "content": f'"What is the hs code for {product_desc}"'},
            {"role": "assistant",
             "content": f'{{"hs_code" : "{hs_code}", "uraian": "{product_desc}" }}'},
        ]
    }


def construct_gemini_training_data(product_desc, hs_code):
    """Return a ``text_input``/``output`` record for one product / HS-code pair."""
    return {"text_input": f'What is the hs code for {product_desc}',
            "output": f'{hs_code}'}