{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build `kuberian.db`\n",
    "\n",
    "Downloads the gzipped JSONL dump at `RAW_DATA`, creates the `functions` and `function_analyses` tables in `DB_NAME`, inserts one `functions` row per input line (plus a `function_analyses` row when the record has a `parsed` field), and finally indexes `function_analyses.function_id`.\n",
    "\n",
    "Running from the top deletes any previous `data.jsonl` and database file first, so the notebook can be re-run as a whole."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " % Total % Received % Xferd Average Speed Time Time Time Current\n",
      " Dload Upload Total Spent Left Speed\n",
      "100 20.8M 100 20.8M 0 0 25.2M 0 --:--:-- --:--:-- --:--:-- 25.3M\n"
     ]
    }
   ],
   "source": [
    "RAW_DATA = \"https://s3.iwanhae.kr/kuberian/1_prod_kubernetes_v1_27_4.jsonl.gz\"\n",
    "RAW_DATA_NAME = \"data.jsonl\"\n",
    "DB_NAME = \"kuberian.db\"\n",
    "# --fail: stop on an HTTP error instead of saving the error page as the archive.\n",
    "!curl --fail \"{RAW_DATA}\" -o \"{RAW_DATA_NAME}.gz\"\n",
    "# -f: on a first run neither file exists yet; that is not an error.\n",
    "!rm -f \"{RAW_DATA_NAME}\" \"{DB_NAME}\"\n",
    "!gunzip \"{RAW_DATA_NAME}.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "\n",
    "# Schema: one row per function, plus an optional analysis row that refers\n",
    "# back to it through function_id.\n",
    "tables = [\n",
    "    \"\"\"\n",
    "CREATE TABLE functions (\n",
    "\tid INTEGER PRIMARY KEY NOT NULL,\n",
    " \tname TEXT NOT NULL,\n",
    " signature TEXT NOT NULL,\n",
    "\tfile TEXT NOT NULL,\n",
    "\tcode TEXT NOT NULL,\n",
    " line_start INTEGER NOT NULL,\n",
    " line_end INTEGER NOT NULL\n",
    ");\"\"\",\n",
    "    \"\"\"\n",
    "CREATE TABLE function_analyses (\n",
    "\tid INTEGER PRIMARY KEY NOT NULL,\n",
    " function_id INTEGER NOT NULL,\n",
    " summary TEXT NOT NULL,\n",
    " background TEXT,\n",
    "\tanalysis TEXT,\n",
    "\tpurpose TEXT,\n",
    " comment TEXT,\n",
    " tldr TEXT\n",
    ");\n",
    "\"\"\"]\n",
    "# Executed in the last cell, after all rows have been inserted.\n",
    "indexes = [\n",
    "    \"CREATE INDEX idx_function_analyses_function_id ON function_analyses (function_id);\"\n",
    "]\n",
    "\n",
    "# `conn` and `cur` stay open across the following cells and are closed in the last one.\n",
    "conn = sqlite3.connect(DB_NAME)\n",
    "cur = conn.cursor()\n",
    "for table in tables:\n",
    "    cur.execute(table)\n",
    "conn.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "sql_function = \"\"\"\n",
    "INSERT INTO functions(id,name,signature,file,code,line_start,line_end)\n",
    " VALUES(?,?,?,?,?,?,?);\n",
    "\"\"\"\n",
    "\n",
    "sql_function_analysis = \"\"\"\n",
    "INSERT INTO function_analyses(function_id, summary, background, analysis, purpose, comment, tldr)\n",
    " VALUES(?,?,?,?,?,?,?);\n",
    "\"\"\"\n",
    "\n",
    "# The summary stored in the database is the text that follows this marker\n",
    "# inside a record's `result` field.\n",
    "SUMMARY_MARKER = \"In one sentence, this is a function that\"\n",
    "\n",
    "# Read the file the first cell produced (RAW_DATA_NAME) and decode it as\n",
    "# UTF-8 explicitly so the result does not depend on the platform locale.\n",
    "with open(RAW_DATA_NAME, encoding=\"utf-8\") as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            # Tolerate blank lines (e.g. a trailing newline at end of file).\n",
    "            continue\n",
    "        d = json.loads(line)\n",
    "        cur.execute(sql_function, (\n",
    "            d['id'], d['name'], d['signature'], d['file'], d['code'],\n",
    "            d['line']['from'], d['line']['to'],\n",
    "        ))\n",
    "        if \"parsed\" in d:\n",
    "            parsed = d['parsed']\n",
    "            parts = d['result'].split(SUMMARY_MARKER)\n",
    "            if len(parts) < 2:\n",
    "                # `summary` is NOT NULL, so fail loudly and say which record is bad.\n",
    "                raise ValueError(\n",
    "                    f\"record {d['id']}: summary marker not found in 'result'\")\n",
    "            summary = parts[1].strip()\n",
    "            # Optional fields are stored as NULL when absent from `parsed`.\n",
    "            cur.execute(sql_function_analysis, (\n",
    "                d['id'],\n",
    "                summary,\n",
    "                parsed.get('background'),\n",
    "                parsed.get('analysis'),\n",
    "                parsed.get('purpose'),\n",
    "                parsed.get('comment'),\n",
    "                parsed.get('tldr'),\n",
    "            ))\n",
    "conn.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the indexes once all rows are in place, then release the database.\n",
    "for index in indexes:\n",
    "    cur.execute(index)\n",
    "conn.commit()\n",
    "conn.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}