/* global React, ReactDOM, parseModelResponse, formatThink, highlightSQL, DAGView, ResultTable, Icon, Step, useTweaks, TweaksPanel, TweakSection, TweakRadio, TweakToggle, lucide */
const { useState, useEffect, useMemo, useRef, useCallback } = React;
// ─── Backend API ──────────────────────────────────────────────
const API_BASE = ''; // same origin (FastAPI serves UI)
// Fetch the cached demo cases from the backend (same-origin FastAPI).
// Resolves with the parsed JSON payload; rejects on any non-2xx status.
async function apiFetchCases() {
  const resp = await fetch(`${API_BASE}/api/cases`);
  if (!resp.ok) {
    throw new Error(`/api/cases failed: ${resp.status}`);
  }
  return resp.json();
}
// Trigger a live model run on the backend for one case.
// model is forwarded as-is (default 'both'). Resolves with the refreshed
// case payload; rejects with the server's error text on non-2xx status.
async function apiRunLive(caseId, model = 'both') {
  const payload = { case_id: caseId, model };
  const resp = await fetch(`${API_BASE}/api/run`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (resp.ok) {
    return resp.json();
  }
  const detail = await resp.text();
  throw new Error(`run failed (${resp.status}): ${detail}`);
}
// ─── Stage states for run orchestration ───────────────────────
// Hook that drives the staged "run" animation for both model panes.
// Exposes the overall stage plus a per-model stage, and controls to
// start, reset, stop, or immediately finish the sequence.
// FIX: the original never cleared pending setTimeouts on unmount, so the
// scheduled setState calls could fire against an unmounted component and
// the timer handles leaked; a cleanup effect now cancels them.
function useRunOrchestrator() {
  const [stage, setStage] = useState('idle');           // overall run stage
  const [helixStage, setHelixStage] = useState('idle'); // per-pane stages
  const [sonnetStage, setSonnetStage] = useState('idle');
  // Pending setTimeout handles for the current animation, so they can be cancelled.
  const timersRef = useRef([]);
  // Cancel all pending stage transitions without touching current state.
  const stop = useCallback(() => {
    timersRef.current.forEach(clearTimeout);
    timersRef.current = [];
  }, []);
  // Clear any pending timers on unmount so no setState fires afterwards.
  useEffect(() => stop, [stop]);
  // Cancel timers and return every stage to 'idle'.
  const reset = useCallback(() => {
    stop();
    setStage('idle');
    setHelixStage('idle');
    setSonnetStage('idle');
  }, [stop]);
  // Start the animation. Latencies are in seconds; each pane's playback is
  // clamped to a watchable window and scaled by speedMul.
  const start = useCallback((helixLatency, sonnetLatency, speedMul = 1) => {
    stop();
    // Faster pacing — was 4.2-8.5s, now ~2.0-4.0s
    const playH = Math.min(4.0, Math.max(2.0, helixLatency / 8)) * speedMul;
    const playS = Math.min(3.4, Math.max(1.8, sonnetLatency / 8)) * speedMul;
    // Schedule one pane's stage sequence spread across durSec seconds.
    const seq = (durSec, setter) => {
      const ms = durSec * 1000;
      const t = (frac, val) => timersRef.current.push(setTimeout(() => setter(val), frac * ms));
      setter('reasoning');
      t(0.42, 'planning');
      t(0.60, 'coding');
      t(0.85, 'executing');
      t(1.0, 'done');
    };
    setStage('running');
    seq(playH, setHelixStage);
    seq(playS, setSonnetStage);
    // Overall stage flips to 'done' just after the slower pane finishes.
    const max = Math.max(playH, playS) * 1000 + 100;
    timersRef.current.push(setTimeout(() => setStage('done'), max));
  }, [stop]);
  // Jump straight to the finished state (used when a live API call returns).
  const finishImmediately = useCallback(() => {
    stop();
    setHelixStage('done');
    setSonnetStage('done');
    setStage('done');
  }, [stop]);
  return { stage, helixStage, sonnetStage, start, reset, stop, finishImmediately };
}
// Map the overall run stage to the visual state of one pipeline step.
// Returns 'idle' | 'done' | 'active' | 'pending'. Unknown stages (and
// 'idle') leave every step idle.
function stepStateFromStage(runStage, stepIdx) {
  const stageOrder = { idle: -1, reasoning: 0, planning: 1, coding: 2, executing: 3, done: 4 };
  const current = stageOrder[runStage] ?? -1;
  if (current < 0) return 'idle';
  if (stepIdx < current) return 'done';
  return stepIdx === current ? 'active' : 'pending';
}
// ─── Pane: per-model output ───────────────────────────────────
// One side of the side-by-side comparison: a single model's label/badge,
// live status line, and four collapsible steps (reasoning, DAG, SQL, result).
// NOTE(review): the JSX element tags in this function's return appear
// stripped/garbled in this view of the file — only the JS expressions are
// annotated; the markup structure could not be verified.
function ModelPane({ label, badgeKind, model, parsed, runStage, isWinner, openSteps, toggleStep, latencyShown }) {
// Visual state ('idle'|'done'|'active'|'pending') for step i at this run stage.
const stateFor = (i) => stepStateFromStage(runStage, i);
const finished = runStage === 'done';
const correct = model.correct;
// Verdict label is only shown once the run has finished.
const verdictText = !finished ? null : (correct ? 'Match' : 'Wrong');
return (
{label}
{badgeKind === 'fine-tune' ? 'Fine-tune' : 'Baseline'}
{runStage === 'idle' && Idle }
{(runStage !== 'idle' && runStage !== 'done') && (
<>
{runStage === 'reasoning' ? 'Reasoning' : runStage === 'planning' ? 'Planning' : runStage === 'coding' ? 'Generating SQL' : 'Executing'}
>
)}
{runStage === 'done' && (
<>
{correct
?
: }
{verdictText} · {latencyShown}s
>
)}
toggleStep(0)}>
{parsed.think
? {formatThink(parsed.think)}
:
— model skipped this block —
}
toggleStep(1)}>
{parsed.dag
?
: DAG block missing or unparseable.
}
toggleStep(2)}>
{highlightSQL(parsed.code)}
toggleStep(3)}>
{!finished
? Awaiting completion…
: (
<>
{correct ? 'Matches gold result' : 'Differs from gold'}
set-equal · {model.rows?.length ?? 0} rows
{model.exec_error && (
SQL error: {model.exec_error}
)}
>
)}
);
}
// ─── Hero ───────────────────────────────────────────────────
// Static hero/intro section: headline copy plus the three benchmark stat
// cards (Helix SFT, Sonnet baseline, Helix+RL) and the benchmark footnote.
// NOTE(review): the JSX element tags appear stripped/garbled in this view of
// the file; the text content below is preserved byte-for-byte.
function Hero() {
return (
Helix CodeDAG · Live model showcase
Plain English in. Working data pipelines out.
Ask a database question in plain English. Helix returns
three things :
the reasoning, a typed pipeline DAG, and CTE SQL ready to drop into DBT, Dagster, or Foundry.
Below, run our 32B specialist side by side with Sonnet 4.6 — real questions,
real SQLite, set-based row comparison. No mocks.
Helix · SFT (deployed)
50.1% EX +9.0
What you're running in this demo. A 32B specialist on a couple of H100s, on-prem.
Sonnet 4.6 · CTE+DAG
41.1% EX
Frontier baseline, zero-shot, same prompt.
Helix · +RL stage
53.3% EX +12.2
Same model after reinforcement learning with execution rewards.
Benchmark · 1,432 BIRD-dev samples · 11 SQLite databases · set-based row comparison.
);
}
// Collapsible schema viewer: shows a table count summary and toggles the
// full DDL text on click.
// NOTE(review): the JSX element tags appear stripped/garbled in this view of
// the file; only the JS logic is annotated.
function SchemaBlock({ ddl }) {
const [open, setOpen] = useState(false);
// Count CREATE TABLE statements (case-insensitive) to summarize schema size.
const tableCount = (ddl.match(/CREATE TABLE/gi) || []).length;
return (
setOpen(o => !o)}>
Schema
{tableCount} tables · click to {open ? 'hide' : 'inspect'}
{open &&
{ddl} }
);
}
// Post-run summary card: one-line narrative for the four correctness
// combinations, plus each model's ✓/✗, latency, and row count.
// NOTE(review): the JSX element tags appear stripped/garbled in this view of
// the file; only the JS logic is annotated.
function VerdictCard({ caseData }) {
const helixOK = caseData.helix.correct;
const sonnetOK = caseData.sonnet.correct;
// Pick the narrative matching this (helixOK, sonnetOK) combination.
let summary;
if (helixOK && !sonnetOK) summary = <>Helix got it right on a question Sonnet missed.>;
else if (!helixOK && sonnetOK) summary = <>Sonnet won this round. We keep the failing cases honest.>;
else if (helixOK && sonnetOK) summary = <>Both correct . Both pipelines match the gold rows.>;
else summary = <>Both missed this one — uncommon edge case.>;
return (
{summary}
Helix · Result
{helixOK ? '✓' : '✗'}
{caseData.helix.latency_s}s · {caseData.helix.rows?.length ?? 0} rows
Sonnet · Result
{sonnetOK ? '✓' : '✗'}
{caseData.sonnet.latency_s}s · {caseData.sonnet.rows?.length ?? 0} rows
);
}
// ─── Main App ───────────────────────────────────────────────
// Root component: loads the cached cases from the backend, orchestrates the
// staged replay animation (or a real live run against Foundry), and renders
// the question picker, the side-by-side comparison, and the tweaks panel.
// NOTE(review): the JSX element tags in the return appear stripped/garbled in
// this view of the file; only the JS logic is annotated.
function App() {
const [tweaks, setTweak] = useTweaks(/*EDITMODE-BEGIN*/{
"speed": "normal",
"showSchema": true,
"openAllSteps": false,
"liveApi": false,
"theme": "light"
}/*EDITMODE-END*/);
// sync html[data-theme] whenever the tweak changes
useEffect(() => {
document.documentElement.setAttribute('data-theme', tweaks.theme === 'dark' ? 'dark' : 'light');
}, [tweaks.theme]);
// Case payloads from /api/cases; null until the initial fetch resolves.
const [cases, setCases] = useState(null);
const [loadError, setLoadError] = useState(null);
const [caseIdx, setCaseIdx] = useState(0); // default to the first ⭐ Helix-wins case
const [runError, setRunError] = useState(null);
const [runningLive, setRunningLive] = useState(false);
const orch = useRunOrchestrator();
const { helixStage, sonnetStage, stage, start, reset, finishImmediately } = orch;
// Open/closed state for each pane's four collapsible steps.
const initialOpen = [true, true, true, true];
const [helixOpen, setHelixOpen] = useState(initialOpen);
const [sonnetOpen, setSonnetOpen] = useState(initialOpen);
// Toggle a single step in one pane (returns a fresh array each time).
const toggleH = (i) => setHelixOpen(s => s.map((v, j) => j === i ? !v : v));
const toggleS = (i) => setSonnetOpen(s => s.map((v, j) => j === i ? !v : v));
// initial load
useEffect(() => {
apiFetchCases()
.then(setCases)
.catch(e => setLoadError(e.message));
}, []);
// reflow lucide icons whenever DOM updates (must run on every render — no early-return above)
useEffect(() => { if (window.lucide) window.lucide.createIcons(); });
if (loadError) {
return Failed to load cases: {loadError}
;
}
if (!cases) {
return Loading…
;
}
const caseData = cases[caseIdx];
// Pre-parsed response sections for each pane (think / DAG / SQL).
const helixParsed = { think: caseData.helix.think, dag: caseData.helix.dag, code: caseData.helix.code };
const sonnetParsed = { think: caseData.sonnet.think, dag: caseData.sonnet.dag, code: caseData.sonnet.code };
// Select a different case: clear errors, reset the animation, re-open steps.
const switchCase = (i) => {
setCaseIdx(i);
setRunError(null);
reset();
setHelixOpen(initialOpen);
setSonnetOpen(initialOpen);
};
// Animation speed multiplier driven by the 'speed' tweak.
const speedMul = tweaks.speed === 'fast' ? 0.5 : tweaks.speed === 'slow' ? 1.6 : 1;
// Run button handler: acts as Reset when a run is in progress/finished,
// otherwise starts either a live API run or a cached replay.
// NOTE(review): `[...cases]` below captures `cases` from this render's
// closure; if cases changed while the live request was in flight, this
// write could clobber it — appears safe while only one run can be active,
// but confirm before allowing concurrent runs.
const handleRun = async () => {
if (stage !== 'idle') {
reset();
setRunError(null);
return;
}
setRunError(null);
if (tweaks.liveApi) {
setRunningLive(true);
start(caseData.helix.latency_s || 12, caseData.sonnet.latency_s || 9, speedMul);
try {
const fresh = await apiRunLive(caseData.id, 'both');
const next = [...cases];
next[caseIdx] = fresh;
setCases(next);
finishImmediately();
} catch (e) {
setRunError(e.message);
reset();
} finally {
setRunningLive(false);
}
} else {
start(caseData.helix.latency_s, caseData.sonnet.latency_s, speedMul);
}
};
// Winner banner: only when exactly one model is correct, after completion.
const winner = stage === 'done' ?
(caseData.helix.correct && !caseData.sonnet.correct ? 'helix'
: caseData.sonnet.correct && !caseData.helix.correct ? 'sonnet' : null)
: null;
return (
<>
Helix · CodeDAG
{tweaks.liveApi && LIVE }
01 · Pick a question
Four real questions, three real databases.
Each case is a natural-language question against a real SQLite database.
The two starred ⭐ cases are where Helix gets it right and Sonnet doesn't.
On the other two, both models land the gold rows.
Live model: helix-codedag-sft-32b-v0 · SFT release at 50.1% EX .
{cases.map((c, i) => {
const star = c.highlight && c.highlight.startsWith('⭐');
return (
switchCase(i)}>
{c.id}
{star && Helix wins }
{c.name}
);
})}
Selected · {caseData.id}
{caseData.query}
{caseData.db_id}.sqlite
expects {caseData.gold_row_count} {caseData.gold_row_count === 1 ? 'row' : 'rows'}
{caseData.evidence && hint provided }
{caseData.evidence && (
Hint: {caseData.evidence}
)}
{runError && (
{runError}
)}
{stage === 'idle' && <> {tweaks.liveApi ? 'Run live on Foundry' : 'Replay cached run'}>}
{stage === 'running' && <> Running…>}
{stage === 'done' && <> Reset & re-run>}
{tweaks.showSchema && }
02 · Live comparison
Same prompt. Same database. Two models.
{stage === 'idle' ? (
Pick a question above, then hit Run .
Both models will generate a pipeline DAG, write CTE SQL, and execute against the real database side by side.
) : (
<>
{stage === 'done' && }
{stage === 'done' && (
Gold reference SQL
{caseData.gold_sql}
)}
>
)}
03 · Why this works
A specialist, verified by execution.
Helix isn't a generalist that happens to write SQL. It's tuned for one job — turning questions
into deployable data pipelines — and every training sample was proven correct against
a real database before training began.
i.
Built for pipelines, not chat.
One job, tuned end-to-end: think → DAG → CTE SQL . Every DAG node maps 1:1 to a CTE — drop the output straight into DBT, Dagster, or Foundry with lineage built in.
ii.
Every sample, proven correct.
11,086 training samples from BIRD and Spider. Each one's SQL was run against a real database and only kept if the rows matched the gold result. No "looks reasonable" data ever reached training.
iii.
Two training stages.
The deployed model is the SFT stage — 50.1% EX, already +9 over Sonnet. Layering reinforcement learning on top, with an execution-based reward, adds another +3.2 points (53.3% ). Same prompt. Same on-prem footprint. Better correctness.
setTweak('liveApi', v)}/>
setTweak('speed', v)}
options={[
{ value: 'fast', label: 'Fast' },
{ value: 'normal', label: 'Normal' },
{ value: 'slow', label: 'Slow' },
]}/>
setTweak('theme', v)}
options={[
{ value: 'light', label: 'Light' },
{ value: 'dark', label: 'Dark' },
]}/>
setTweak('showSchema', v)}/>
{
setTweak('openAllSteps', v);
setHelixOpen([v, v, v, v]);
setSonnetOpen([v, v, v, v]);
}}/>
>
);
}
// Mount the app into #root (UMD React 18 API, no bundler).
// NOTE(review): the element argument to render() appears stripped in this
// view of the file — presumably the <App /> element; confirm against source.
ReactDOM.createRoot(document.getElementById('root')).render( );