In [1]:
RAW_DATA = "https://s3.iwanhae.kr/kuberian/1_prod_kubernetes_v1_27_4.jsonl.gz"
RAW_DATA_NAME = "data.jsonl"
DB_NAME = "kuberian.db"
!curl "{RAW_DATA}" -o "{RAW_DATA_NAME}.gz"
!rm "{RAW_DATA_NAME}" "{DB_NAME}"
!gunzip "{RAW_DATA_NAME}.gz"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20.8M  100 20.8M    0     0  10.6M      0  0:00:01  0:00:01 --:--:-- 10.6M


In [2]:
import sqlite3
# DDL for the two tables of the database.
#   functions          - one row per function: identity, location, source text.
#   function_analyses  - analysis text for a function, linked to
#                        functions.id through function_id.
# `IF NOT EXISTS` keeps this cell re-runnable against an existing database
# instead of raising "table ... already exists".
tables = [
    """
CREATE TABLE IF NOT EXISTS functions (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL,
    signature TEXT NOT NULL,
    file TEXT NOT NULL,
    code TEXT NOT NULL,
    line_start INTEGER NOT NULL,
    line_end INTEGER NOT NULL
);""",
    """
CREATE TABLE IF NOT EXISTS function_analyses (
    id INTEGER PRIMARY KEY,
    function_id INTEGER,
    summary TEXT NOT NULL,
    background TEXT,
    analysis TEXT,
    purpose TEXT,
    comment TEXT,
    tldr TEXT
);
"""]
# Secondary indexes; only declared here, they are executed by a later cell.
indexes = [
    "CREATE INDEX IF NOT EXISTS idx_function_analyses_function_id ON function_analyses (function_id);"
]

# Open (or create) the database file and build the schema.
conn = sqlite3.connect(DB_NAME)
cur = conn.cursor()
for table in tables:
    cur.execute(table)
conn.commit()

In [3]:
import json

# Parameterised INSERT for one row of the `functions` table.
sql_function = """
INSERT INTO functions(id,name,signature,file,code,line_start,line_end)
    VALUES(?,?,?,?,?,?,?);
"""

# Parameterised INSERT for one row of the `function_analyses` table.
sql_function_analysis = """
INSERT INTO function_analyses(function_id, summary, background, analysis, purpose, comment, tldr)
    VALUES(?,?,?,?,?,?,?);
"""

# The one-sentence summary is the text following this marker in `result`.
SUMMARY_MARKER = "In one sentence, this is a function that"
# Optional keys of `parsed`, in the column order of sql_function_analysis.
ANALYSIS_FIELDS = ("background", "analysis", "purpose", "comment", "tldr")

# Load every JSON Lines record into the database.
# Each record yields one `functions` row; records that carry a "parsed" object
# additionally yield one `function_analyses` row.
try:
    # JSON is UTF-8; be explicit so the load does not depend on the locale.
    with open(RAW_DATA_NAME, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            d = json.loads(line)
            cur.execute(sql_function, (
                d['id'], d['name'], d['signature'], d['file'], d['code'],
                d['line']['from'], d['line']['to'],
            ))
            if "parsed" not in d:
                continue
            parsed = d['parsed']
            pieces = d['result'].split(SUMMARY_MARKER)
            if len(pieces) < 2:
                # `summary` is NOT NULL, so fail with context rather than
                # with a bare IndexError.
                raise ValueError(
                    f"{RAW_DATA_NAME}:{line_no}: record id={d['id']!r} has no "
                    f"summary marker {SUMMARY_MARKER!r} in 'result'"
                )
            summary = pieces[1].strip()
            cur.execute(sql_function_analysis, (
                d['id'],
                summary,
                # Missing optional fields are stored as NULL.
                *(parsed.get(field) for field in ANALYSIS_FIELDS),
            ))
except Exception:
    # Discard the partial load so a later commit cannot persist half the data.
    conn.rollback()
    raise
conn.commit()

In [4]:
# Run each CREATE INDEX statement from `indexes`, persist the result, and
# release the database handle. Indexes are built after the rows are inserted
# (presumably so the bulk load does not pay for index maintenance).
for index_ddl in indexes:
    cur.execute(index_ddl)

conn.commit()
conn.close()