initial commit
This commit is contained in:
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
||||||
8
.idea/mensa-gpt.iml
generated
Normal file
8
.idea/mensa-gpt.iml
generated
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
4
.idea/misc.xml
generated
Normal file
4
.idea/misc.xml
generated
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (mensa-gpt)" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
||||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/mensa-gpt.iml" filepath="$PROJECT_DIR$/.idea/mensa-gpt.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
BIN
HSCodeMasterBPS.pdf
Normal file
BIN
HSCodeMasterBPS.pdf
Normal file
Binary file not shown.
BIN
MensaResumeHS.xlsx
Normal file
BIN
MensaResumeHS.xlsx
Normal file
Binary file not shown.
48
README.md
Normal file
48
README.md
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
# Cara Running GPT Training
|
||||||
|
1. Siapkan file HSCodeMasterBPS.pdf dari https://www.bps.go.id/assets/docs/HSCode%20Master%20BPS.pdf
|
||||||
|
2. Siapkan file seluruh list import dari oracle sesuai format tblPibDtl.xlsx
|
||||||
|
3. Siapkan dokumen koreksi hs code dari bea cukai sesuai dengan format MensaResumeHS.xlsx
|
||||||
|
4. Aplikasi hanya bisa dijalankan di Linux, dikarenakan adanya library Python
|
||||||
|
yang tidak bisa dipakai di Windows
|
||||||
|
5. Install Anaconda / Mini Conda. URL ada di https://www.anaconda.com/download/success
|
||||||
|
6. Buat virtual environment di conda dengan menjalankan command berikut
|
||||||
|
```shell
|
||||||
|
conda env create -f environment.yml
|
||||||
|
```
|
||||||
|
7. Aktivasi conda virtual environment
|
||||||
|
```shell
|
||||||
|
conda activate mensa-gpt
|
||||||
|
```
|
||||||
|
8. Untuk memastikan library telah terdownload jalankan conda environment update
|
||||||
|
```shell
|
||||||
|
conda env update --file environment.yml
|
||||||
|
```
|
||||||
|
9. Update OpenAI API key di dalam `gpt_trainer.py`, serta update gpt model apabila diperlukan
|
||||||
|
10. Jalankan main.py dengan python
|
||||||
|
```shell
|
||||||
|
python main.py
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Konfigurasi tambahan
|
||||||
|
### Konfigurasi File HS Code BPS
|
||||||
|
1. Apabila ada perubahan halaman HS Code terbaru, silahkan update file `insw_scrapper.py` pada line 28.
|
||||||
|
Rubah nilai `start_page` dengan nilai yang sesuai
|
||||||
|
2. Nama file PDF serta path dari file tersebut dapat di sesuaikan di main.py pada pemanggilan fungsi `create_gpt_training_data` di parameter `hscode_bps_pdf`
|
||||||
|
|
||||||
|
### Konfigurasi Dokumen Import Mensa
|
||||||
|
1. Sistem hanya menerima format file xlsx
|
||||||
|
2. Nama file excel serta path dari file tersebut dapat di sesuaikan di main.py pada pemanggilan fungsi
|
||||||
|
`create_gpt_training_data` di parameter `mensa_import_excel`
|
||||||
|
3. Usahakan menggunakan format yang sama persis seperti contoh. Apabila ada perubahan, penyesuaian dapat dilakukan di
|
||||||
|
file `insw_scrapper.py` pada line 37, dimana kolom A pada excel terdapat pada array index ke 0
|
||||||
|
|
||||||
|
### Konfigurasi Dokumen koreksi hs code dari bea cukai
|
||||||
|
1. Sistem hanya menerima format file xlsx
|
||||||
|
2. Nama file excel serta path dari file tersebut dapat di sesuaikan di main.py pada pemanggilan fungsi
|
||||||
|
`create_gpt_training_data` di parameter `mensa_custom_correction_excel`
|
||||||
|
3. Usahakan menggunakan format yang sama persis seperti contoh. Apabila ada perubahan, penyesuaian dapat dilakukan di
|
||||||
|
file `insw_scrapper.py` pada line 46 dan 44, dimana kolom A pada excel terdapat pada array index ke 0
|
||||||
|
4. line 44 digunakan untuk mengecek apakah hs code dari row tersebut kosong, sebagai indikator empty row
|
||||||
|
5. parameter usecols pada line 41 digunakan untuk memotong data yang terlalu besar dengan cara mengambil
|
||||||
|
data dari kolom tertentu saja
|
||||||
BIN
__pycache__/gemini_trainer.cpython-310.pyc
Normal file
BIN
__pycache__/gemini_trainer.cpython-310.pyc
Normal file
Binary file not shown.
BIN
__pycache__/gpt_trainer.cpython-310.pyc
Normal file
BIN
__pycache__/gpt_trainer.cpython-310.pyc
Normal file
Binary file not shown.
BIN
__pycache__/insw_scrapper.cpython-310.pyc
Normal file
BIN
__pycache__/insw_scrapper.cpython-310.pyc
Normal file
Binary file not shown.
8
environment.yml
Normal file
8
environment.yml
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
name: mensa-gpt
|
||||||
|
dependencies:
|
||||||
|
- pip:
|
||||||
|
- pdfplumber
|
||||||
|
- pandas
|
||||||
|
- openpyxl
|
||||||
|
- openai
|
||||||
|
- scikit-learn
|
||||||
17
gemini_trainer.py
Normal file
17
gemini_trainer.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
import csv
import json
import os
|
||||||
|
# Secrets must not live in source control, so the key is read from the
# GEMINI_API_KEY environment variable (empty string when it is not set).
# NOTE(review): the literal key that used to be assigned here is still in the
# git history and should be revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
|
||||||
|
|
||||||
|
|
||||||
|
def generate_gemini_training_file(data, output_path="training_gemini.csv"):
    """Write Gemini tuning examples to a CSV file with a header row.

    Args:
        data: Iterable of dicts keyed by ``text_input`` and ``output``.
        output_path: Destination CSV path. Defaults to
            ``training_gemini.csv`` in the current working directory.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` when a row contains a key
            other than ``text_input`` / ``output``.
    """
    fieldnames = ['text_input', 'output']

    # newline='' hands line-ending control to the csv module (otherwise
    # blank lines appear on platforms that translate '\n'); the explicit
    # encoding keeps non-ASCII product descriptions independent of locale.
    with open(output_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
38
gpt_trainer.py
Normal file
38
gpt_trainer.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import json
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# we can use gpt-4o-mini , gpt-4o or gpt-3.5-turbo . right now we use mini for price reason
|
||||||
|
gpt_model = "gpt-4o-mini"
|
||||||
|
openai_api_key = "apikey"
|
||||||
|
|
||||||
|
fine_tune_modelname = "mensa-hsai"
|
||||||
|
|
||||||
|
|
||||||
|
def train_gpt(data):
    """Upload fine-tuning examples to OpenAI and start a fine-tuning job.

    The examples are written to ``training.jsonl`` (one JSON document per
    line) in the current working directory, that file is uploaded with
    purpose ``fine-tune``, and a fine-tuning job is created from it using
    the module-level ``gpt_model`` and ``fine_tune_modelname``. The uploaded
    file id and the job id are printed.

    Args:
        data: Iterable of JSON-serialisable training examples.
    """
    client = OpenAI(api_key=openai_api_key)
    training_temp_filename = "training.jsonl"

    # create the file that contains the training data in jsonl format
    with open(training_temp_filename, 'w') as file:
        for entry in data:
            json.dump(entry, file)
            file.write('\n')

    # upload the training file; the context manager closes the handle, which
    # the previous inline open() call never did
    with open(training_temp_filename, 'rb') as upload:
        training_file = client.files.create(file=upload, purpose='fine-tune')
    print("Training file id:", training_file.id)

    # create the job that starts the training
    response = client.fine_tuning.jobs.create(
        training_file=training_file.id,
        model=gpt_model,
        suffix=fine_tune_modelname
    )

    print("GPT Fine Tune job id:", response.id)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
64
insw_scrapper.py
Normal file
64
insw_scrapper.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
import numpy
|
||||||
|
import pdfplumber
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def extract_table_from_pdf(pdf_path, start_page, skip_header):
    """Collect table rows from a PDF into a single DataFrame.

    Pages are visited from ``start_page`` (1-based) through the last page.
    For each page, the result of ``page.extract_table()`` is appended to the
    running list of rows, optionally without its first row.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.
        start_page: 1-based number of the first page to read.
        skip_header: When truthy, drop the first row of every page's table.

    Returns:
        A ``pandas.DataFrame`` built from the collected rows, or ``None``
        (after printing a notice) when nothing was collected.
    """
    collected_rows = []
    with pdfplumber.open(pdf_path) as document:
        page_count = len(document.pages)
        for page_number in range(start_page - 1, page_count):
            page_rows = document.pages[page_number].extract_table()
            if not page_rows:
                continue
            if skip_header:
                page_rows = page_rows[1:]
            collected_rows.extend(page_rows)

    if not collected_rows:
        print("No table found in the specified range.")
        return None
    return pd.DataFrame(collected_rows)
|
||||||
|
|
||||||
|
|
||||||
|
def create_gpt_training_data(hscode_bps_pdf=None, mensa_import_excel=None, mensa_custom_correction_excel=None):
    """Build the list of training examples from up to three optional sources.

    Every source is skipped when its argument is ``None``. Columns are
    addressed by position (0-based), so a workbook's header names do not
    matter.

    Args:
        hscode_bps_pdf: Path of the HS code PDF. Tables are read from page
            864 onwards; column 2 is used as the description and column 1
            as the HS code.
        mensa_import_excel: Path of the import-history workbook; column 2 is
            used as the description and column 1 as the HS code.
        mensa_custom_correction_excel: Path of the customs-correction
            workbook. Only Excel columns B, K and L are loaded; of those,
            position 1 is the description and position 2 the HS code.

    Returns:
        A list with one ``construct_gpt_hs_training`` result per usable row.
    """
    training_data = []

    # create training data in pandas from INSW PDF files. the newest data is started from page 864
    if hscode_bps_pdf is not None:
        data = extract_table_from_pdf(pdf_path=hscode_bps_pdf, start_page=864, skip_header=True)
        # extract_table_from_pdf returns None when no table was found;
        # iterating it unconditionally used to raise AttributeError.
        if data is not None:
            for _, row in data.iterrows():
                training_data.append(
                    construct_gpt_hs_training(product_desc=row.iloc[2], hs_code=row.iloc[1]))

    # create training data from mensa historical import data
    if mensa_import_excel is not None:
        import_hist = pd.read_excel(mensa_import_excel)
        for _, row in import_hist.iterrows():
            # .iloc is explicit positional access; row[2] on a row labelled
            # by header strings relies on a deprecated pandas fallback.
            training_data.append(
                construct_gpt_hs_training(product_desc=row.iloc[2], hs_code=row.iloc[1]))

    # create training data from mensa custom's correction import data
    if mensa_custom_correction_excel is not None:
        custom_correction = pd.read_excel(mensa_custom_correction_excel, usecols="B,K,L")
        for _, row in custom_correction.iterrows():
            hs_code = row.iloc[2]
            # if no hs code found (empty data), skip data append.
            # pd.isna accepts any scalar, whereas numpy.isnan raised
            # TypeError for HS codes stored as text.
            if pd.isna(hs_code) or not hs_code:
                continue
            training_data.append(
                construct_gpt_hs_training(product_desc=row.iloc[1], hs_code=hs_code))

    return training_data
|
||||||
|
|
||||||
|
|
||||||
|
def construct_gpt_hs_training(product_desc, hs_code, output_format="gemini"):
    """Build one training example for a product description / HS code pair.

    Args:
        product_desc: Product description inserted into the question.
        hs_code: HS code used as the expected answer.
        output_format: ``"gemini"`` (default, the behaviour this function
            has always had) delegates to ``construct_gemini_training_data``;
            ``"gpt"`` returns an OpenAI chat fine-tuning record with
            system / user / assistant messages.

    Returns:
        A dict in the requested format.

    Raises:
        ValueError: If ``output_format`` is neither ``"gemini"`` nor ``"gpt"``.
    """
    # Local import: this module does not import json at file level.
    import json

    # Replaces the former ``if 1 == 1`` switch, which made the chat-format
    # branch below unreachable.
    if output_format == "gemini":
        return construct_gemini_training_data(product_desc, hs_code)
    if output_format != "gpt":
        raise ValueError(f"Unsupported output_format: {output_format!r}")

    # json.dumps escapes quotes and backslashes inside the description, so
    # the assistant reply is always parseable JSON.
    assistant_reply = json.dumps(
        {"hs_code": f"{hs_code}", "uraian": f"{product_desc}"},
        ensure_ascii=False,
    )
    return {
        "messages": [
            {"role": "system", "content": "You are mensa-hsai that able to determine HS Code"
                                          " from a product including product in Bahasa Indonesia"},
            {"role": "user", "content": f'"What is the hs code for {product_desc}"'},
            {"role": "assistant", "content": assistant_reply}]}
|
||||||
|
|
||||||
|
|
||||||
|
def construct_gemini_training_data(product_desc, hs_code):
    """Return one Gemini tuning record for a description / HS code pair.

    The record has two string fields: ``text_input`` holds the question
    built from ``product_desc`` and ``output`` holds the formatted
    ``hs_code``.
    """
    question = "What is the hs code for " + format(product_desc)
    answer = format(hs_code)
    return dict(text_input=question, output=answer)
|
||||||
14
main.py
Normal file
14
main.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
from insw_scrapper import create_gpt_training_data
|
||||||
|
from gemini_trainer import generate_gemini_training_file
|
||||||
|
from gpt_trainer import train_gpt
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Input workbooks. The BPS PDF source is not passed here
    # (it would be hscode_bps_pdf="HSCodeMasterBPS.pdf").
    import_history_xlsx = "tblPibDtl.xlsx"
    customs_correction_xlsx = "MensaResumeHS.xlsx"

    training_data = create_gpt_training_data(
        mensa_import_excel=import_history_xlsx,
        mensa_custom_correction_excel=customs_correction_xlsx,
    )

    # Only the Gemini CSV is generated; train_gpt(data=training_data) is not invoked.
    generate_gemini_training_file(training_data)
|
||||||
|
|
||||||
|
|
||||||
|
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
|
||||||
BIN
mensa-gpt.zip
Normal file
BIN
mensa-gpt.zip
Normal file
Binary file not shown.
BIN
tblPibDtl.xlsx
Normal file
BIN
tblPibDtl.xlsx
Normal file
Binary file not shown.
38179
training.jsonl
Normal file
38179
training.jsonl
Normal file
File diff suppressed because it is too large
Load Diff
38183
training_gemini.csv
Normal file
38183
training_gemini.csv
Normal file
File diff suppressed because it is too large
Load Diff
1
training_gemini.jsonl
Normal file
1
training_gemini.jsonl
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user