# AutoSQL/AutoSQL.py at main · PrajwalAmte/AutoSQL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#!/usr/bin/env python3
"""
AutoSQL — Self-optimizing SQL Query Pipeline
─────────────────────────────────────────────
Inspired by karpathy/autoresearch.

The loop:
  LLM rewrites query → run it → measure (speed + correctness) → keep if better → repeat

Usage:
  python AutoSQL.py --query slow.sql --db mydb.sqlite
  python AutoSQL.py --query "SELECT ..." --db mydb.sqlite --iterations 15
  python AutoSQL.py --query "SELECT ..." --db mydb.sqlite --model llama-3.3-70b-versatile

Requirements:
  pip install groq
  export GROQ_API_KEY=gsk_...   (free at https://console.groq.com)
"""

import os
import sqlite3
import time
import json
import argparse
import hashlib
from pathlib import Path
from groq import Groq

DEFAULT_MODEL = "llama-3.3-70b-versatile"

# ─── Helpers ──────────────────────────────────────────────────────────────────

def groq_generate(model: str, prompt: str) -> str:
    """Send a single-turn prompt to Groq and return the reply text.

    Args:
        model: Model identifier forwarded to the chat-completions call.
        prompt: Text sent as the only ``user`` message.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is unset
            or empty.
    """
    key = os.environ.get("GROQ_API_KEY")
    if not key:
        raise RuntimeError(
            "GROQ_API_KEY not set.  Get a free key → https://console.groq.com"
        )

    conversation = [{"role": "user", "content": prompt}]
    completion = Groq(api_key=key).chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.3,
        max_completion_tokens=1024,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_schema(conn: sqlite3.Connection) -> str:
    """Return a compact text representation of every table and its columns.

    Each table becomes one line of the form ``  name(col TYPE, col TYPE)``;
    lines are ordered by table name and joined with newlines.

    Args:
        conn: Open SQLite connection to introspect.

    Returns:
        The schema summary, or an empty string when ``sqlite_master`` lists
        no tables.
    """
    tables = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
    ).fetchall()
    parts: list[str] = []
    for (tbl,) in tables:
        # The table name is interpolated into the PRAGMA text, so quote it as
        # an identifier (doubling embedded quotes). Without this, names with
        # spaces or reserved words such as "order" are a syntax error.
        quoted = '"' + tbl.replace('"', '""') + '"'
        cols = conn.execute(f"PRAGMA table_info({quoted})").fetchall()
        # table_info rows are (cid, name, type, ...): keep name and type.
        col_str = ", ".join(f"{c[1]} {c[2]}" for c in cols)
        parts.append(f"  {tbl}({col_str})")
    return "\n".join(parts)


def run_query(
    conn: sqlite3.Connection, query: str, runs: int = 3
) -> tuple[float, str | None, str | None]:
    """
    Execute `query` multiple times and return (avg_ms, result_hash, error).
    The hash is computed on the first run; timing is averaged over all runs.
    """
    times: list[float] = []
    result_hash: str | None = None

    for i in range(runs):
        try:
            t0 = time.perf_counter()
            rows = conn.execute(query).fetchall()
            elapsed_ms = (time.perf_counter() - t0) * 1_000
            times.append(elapsed_ms)
            if i == 0:
                result_hash = hashlib.md5(
                    str(sorted(rows)).encode()
                ).hexdigest()
        except Exception as exc:
            return 0.0, None, str(exc)

    return sum(times) / len(times), result_hash, None


# ─── LLM Optimizer ────────────────────────────────────────────────────────────

def build_prompt(
    schema: str,
    original_query: str,
    current_best: str,
    history: list[dict],
    iteration: int,
) -> str:
    """Assemble the optimisation prompt sent to the model.

    Args:
        schema: Text description of the database schema.
        original_query: The baseline query the results must match.
        current_best: Fastest correct query found so far.
        history: Records of earlier attempts; each needs the keys
            ``iteration``, ``time_ms``, ``speedup``, ``correct``, ``error``
            and ``query``. Only the last five are included.
        iteration: Number of the iteration about to run; the prompt labels
            the current best with ``iteration - 1``.

    Returns:
        The complete prompt text.
    """
    recent = history[-5:]
    attempts = ""
    if recent:
        entries: list[str] = []
        for past in recent:
            verdict = "✓ correct" if past["correct"] else "✗ wrong result"
            failure = past["error"]
            error_note = f"  error: {failure}" if failure else ""
            summary = (
                f"  [{past['iteration']}] {past['time_ms']:.1f}ms"
                f" | {past['speedup']:.2f}x speedup | {verdict}{error_note}\n"
            )
            # Cap each echoed query at 300 characters to keep the prompt small.
            snippet = past["query"][:300].strip()
            entries.append(f"{summary}       {snippet}\n\n")
        attempts = "\n\nPrevious attempts (learn from these):\n" + "".join(entries)

    header = (
        "You are an expert SQL optimizer. Rewrite the query below to run as fast as possible\n"
        "while returning byte-for-byte IDENTICAL results (same rows, same order).\n"
        "\n"
        f"SCHEMA:\n{schema}\n"
        "\n"
        f"ORIGINAL QUERY (baseline):\n{original_query}\n"
        "\n"
        f"CURRENT BEST QUERY (iteration {iteration - 1}):\n{current_best}\n"
        f"{attempts}\n"
    )
    footer = (
        "OPTIMIZATION TECHNIQUES TO CONSIDER:\n"
        "  • Replace correlated subqueries with pre-aggregated JOINs or CTEs\n"
        "  • Use window functions (AVG OVER, SUM OVER) instead of self-joins\n"
        "  • Push WHERE filters as early as possible (before joins)\n"
        "  • Avoid re-scanning large tables multiple times\n"
        "  • Minimise columns projected inside subqueries\n"
        "\n"
        "Return ONLY the raw SQL — no explanation, no markdown fences, no backticks."
    )
    return header + footer


def optimize(
    model: str,
    schema: str,
    original_query: str,
    current_best: str,
    history: list[dict],
    iteration: int,
) -> str:
    """Request a rewritten query from the model and return it as plain SQL.

    The prompt is built from the schema, the baseline query, the current best
    query and the attempt history, then sent to the given Groq model.

    Returns:
        The model's reply. If the reply begins with a markdown fence, every
        line starting with a fence marker is dropped and the remainder is
        stripped; otherwise the reply is returned unchanged.
    """
    answer = groq_generate(
        model,
        build_prompt(schema, original_query, current_best, history, iteration),
    )

    # Replies that do not open with a fence are passed through untouched.
    if not answer.startswith("```"):
        return answer

    kept = [line for line in answer.splitlines() if not line.startswith("```")]
    return "\n".join(kept).strip()


# ─── Main Loop ────────────────────────────────────────────────────────────────

def autosql(
    conn: sqlite3.Connection,
    query: str,
    iterations: int = 10,
    model: str = DEFAULT_MODEL,
) -> str | None:
    """Run the rewrite → measure → keep-if-better loop and report the winner.

    The baseline query is timed and fingerprinted first. Each iteration then
    asks the model for a rewrite, runs it, and accepts it as the new best only
    when its result fingerprint matches the baseline and its average time
    beats the best so far. Progress is printed as it goes.

    Side effects:
        Prints a report to stdout and writes ``autosql_log.json`` in the
        current working directory.

    Args:
        conn: Open SQLite connection the queries are executed on.
        query: The query to optimise; also the correctness reference.
        iterations: Maximum number of rewrite attempts.
        model: Groq model identifier used to generate rewrites.

    Returns:
        The fastest correct query found (the original if nothing beat it), or
        ``None`` if the baseline query itself fails.
    """
    schema = get_schema(conn)

    bar = "─" * 62
    print(f"\n{bar}")
    print("  AutoSQL — Self-optimizing Query Pipeline")
    print(f"  Model  : {model}  (Groq)")
    print(bar)
    print(f"\nSchema:\n{schema}\n")
    print(f"Query:\n{query}\n")
    print(bar)

    # ── Baseline ──────────────────────────────────────────────────────────────
    print("\n  Measuring baseline …", end=" ", flush=True)
    baseline_ms, baseline_hash, err = run_query(conn, query)
    if err:
        print(f"\n✗ Baseline query failed: {err}")
        return None
    print(f"{baseline_ms:.1f} ms\n")

    best_query   = query
    best_ms      = baseline_ms
    best_speedup = 1.0
    history: list[dict] = []

    # ── Optimisation loop ─────────────────────────────────────────────────────
    for i in range(1, iterations + 1):
        print(f"  [{i:02d}/{iterations:02d}] Generating … ", end="", flush=True)

        try:
            new_query = optimize(model, schema, query, best_query, history, i)
        except Exception as exc:
            # Boundary around the remote API call: besides the missing-key
            # RuntimeError, client/network errors must also stop the loop
            # cleanly, so the report and log below are still produced
            # instead of losing every result gathered so far.
            print(f"✗  {exc}")
            break

        new_ms, new_hash, err = run_query(conn, new_query)

        # A candidate only counts if it ran and reproduced the baseline rows.
        correct = (not err) and (new_hash == baseline_hash)
        speedup = (baseline_ms / new_ms) if (correct and new_ms > 0) else 0.0
        improved = correct and new_ms < best_ms

        # Verdict string
        if err:
            verdict = f"✗  error: {err[:55]}"
        elif not correct:
            verdict = "✗  wrong result"
        elif improved:
            best_query   = new_query
            best_ms      = new_ms
            best_speedup = speedup
            verdict = f"✓  {new_ms:.1f} ms  ({speedup:.2f}× faster)  ← NEW BEST"
        else:
            verdict = f"✓  {new_ms:.1f} ms  ({speedup:.2f}×)  no improvement"

        print(verdict)

        # Every attempt, good or bad, is fed back into later prompts.
        history.append({
            "iteration": i,
            "query":     new_query,
            "time_ms":   new_ms,
            "speedup":   speedup,
            "correct":   correct,
            "error":     err,
        })

    # ── Report ────────────────────────────────────────────────────────────────
    print(f"\n{bar}")
    print("  Final Report")
    print(bar)
    print(f"  Baseline : {baseline_ms:.1f} ms")
    print(f"  Best     : {best_ms:.1f} ms  ({best_speedup:.2f}× faster)\n")
    print("  Best Query:\n")
    for line in best_query.splitlines():
        print(f"    {line}")

    log = {
        "model":            model,
        "schema":           schema,
        "baseline_query":   query,
        "baseline_ms":      baseline_ms,
        "best_query":       best_query,
        "best_ms":          best_ms,
        "speedup":          best_speedup,
        "iterations":       history,
    }
    log_path = Path("autosql_log.json")
    log_path.write_text(json.dumps(log, indent=2))
    print(f"\n  Log → {log_path.resolve()}")
    print(bar + "\n")

    return best_query


# ─── CLI ──────────────────────────────────────────────────────────────────────

def main() -> None:
    """Command-line entry point: parse arguments and run one optimisation.

    ``--query`` is read from disk when it ends with ``.sql`` and is otherwise
    used verbatim as SQL text. The database connection is switched to
    query-only mode for the run and is closed when the run finishes or fails.
    """
    parser = argparse.ArgumentParser(
        description="AutoSQL: self-optimizing SQL query pipeline (powered by Groq)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python AutoSQL.py --query slow.sql --db app.sqlite
  python AutoSQL.py --query "SELECT ..." --db app.sqlite --iterations 15
  python AutoSQL.py --query "SELECT ..." --db app.sqlite --model llama-3.1-8b-instant
        """,
    )
    parser.add_argument("--query", type=str, required=True,
                        help="SQL query string or path to a .sql file")
    parser.add_argument("--db", type=str, required=True,
                        help="Path to SQLite database file")
    parser.add_argument("--iterations", type=int, default=10,
                        help="Optimisation iterations (default: 10)")
    parser.add_argument("--model", type=str, default=DEFAULT_MODEL,
                        help=f"Groq model to use (default: {DEFAULT_MODEL})")
    args = parser.parse_args()

    # Load the query before opening the database so that a bad file path
    # fails without leaving a connection open.
    query = (
        Path(args.query).read_text()
        if args.query.endswith(".sql")
        else args.query
    )

    conn = sqlite3.connect(args.db)
    try:
        # Candidate queries are model-generated text executed verbatim, so
        # make the connection reject data-modifying statements (DROP, DELETE,
        # UPDATE, ...) rather than trusting the model to emit only SELECTs.
        conn.execute("PRAGMA query_only = ON")
        autosql(conn, query, iterations=args.iterations, model=args.model)
    finally:
        conn.close()

if __name__ == "__main__":
    main()