"""
autodeclare.py — AutoDeclare AI-as-Judge Pipeline (integrated)
=================================================================
Single-script implementation of the AutoDeclare design:
1. Load all 28 prompts from prompt_inventory.csv
2. Seed synthetic baseline scores from hardcoded rubric values
3. Call Judge A (prompt quality, 8 dims) via local Ollama for every
substantive prompt — temperature=0, JSON-only output contract
4. Call Judge B (output quality, 8 dims) for any prompt that has a
captured output file in the staging directory
5. Write scores.json to /data/judge_staging/scores/
6. Rebuild rubric_dashboard.html — all original charts PLUS a new
output quality section — fully self-contained, no CDN required
7. Start an HTTP server bound to 127.0.0.1:8080 only (SSH tunnel access)
and print the tunnel command to run on your local machine
Usage
-----
# Full run (prompt quality + output quality if files exist):
python autodeclare.py --run-id 2026-02-27_001
# Score prompts only — no output files required:
python autodeclare.py --prompts-only
# Use a smaller model on a g4dn.xlarge:
python autodeclare.py --run-id 2026-02-27_001 --model mistral-small
# Skip the judge — just rebuild the dashboard from an existing scores file:
python autodeclare.py --dashboard-only --run-id 2026-02-27_001
# Skip serving (write HTML and exit):
python autodeclare.py --no-serve --run-id 2026-02-27_001
Dependencies
------------
pip install openai pandas plotly
Output capture helper (paste into your clinical pipeline)
---------------------------------------------------------
from autodeclare import save_output_for_judge
save_output_for_judge(
prompt_name="reason_for_referral",
output_text=section_text,
run_id=current_run_id,
)
"""
# ═════════════════════════════════════════════════════════════════════════════
# IMPORTS
# ═════════════════════════════════════════════════════════════════════════════
import argparse
import csv
import http.server
import json
import logging
import os
import pathlib
import signal
import socketserver
import sys
import threading
import time
from datetime import datetime, timezone
from typing import Optional
try:
import pandas as pd
import plotly.graph_objects as go
import plotly
from plotly.subplots import make_subplots
except ImportError:
sys.exit("Missing packages. Install with: pip install pandas plotly")
try:
from openai import OpenAI
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
# ═════════════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ═════════════════════════════════════════════════════════════════════════════
DEFAULT_BASE_URL = "http://localhost:11434/v1"  # Ollama's OpenAI-compatible endpoint
DEFAULT_API_KEY = "ollama"  # placeholder — Ollama does not validate the key
DEFAULT_MODEL = "llama3.3"
DEFAULT_CSV = "prompt_inventory.csv"
DEFAULT_STAGING = "/data/judge_staging"  # outputs/ and scores/ subdirs live here
DEFAULT_PORT = 8080  # dashboard HTTP server port (bound to loopback only)
OUTPUT_HTML = "rubric_dashboard.html"
RETRY_LIMIT = 3  # max judge-call attempts per prompt
RETRY_DELAY = 2.0  # base retry delay in seconds (doubled each attempt)
INTER_CALL_DELAY = 0.5  # pause between successive judge calls, seconds
# ── Prompt quality dimensions — must match rubric_dashboard DIM_IDS ──────────
PROMPT_DIMS = [
("instruction_clarity", "Instruction Clarity"),
("constraint_completeness", "Constraint Completeness"),
("output_specificity", "Output Specificity"),
("pii_handling", "PII Token Handling"),
("clinical_safety", "Clinical Safety"),
("duplication_risk", "Duplication Risk*"),
("structural_guidance", "Structural Guidance"),
("prompt_efficiency", "Prompt Efficiency"),
]
PROMPT_DIM_IDS = [d[0] for d in PROMPT_DIMS]
PROMPT_DIM_LABELS = [d[1] for d in PROMPT_DIMS]
# ── Output quality dimensions — new, assessed against generated output ────────
OUTPUT_DIMS = [
("transcript_fidelity", "Transcript Fidelity"),
("no_hallucination", "No Hallucination"),
("clinical_tone", "Clinical Tone"),
("section_completeness", "Section Completeness"),
("pii_token_preserved", "PII Token Preserved"),
("no_unsolicited_diagnosis", "No Unsolicited Diagnosis"),
("british_english", "British English"),
("word_count_adherence", "Word Count Adherence"),
]
OUTPUT_DIM_IDS = [d[0] for d in OUTPUT_DIMS]
OUTPUT_DIM_LABELS = [d[1] for d in OUTPUT_DIMS]
# ── Assessment type → group label ────────────────────────────────────────────
ASSESSMENT_TO_GROUP = {
"ADHD Initial Assessment": "ADHD Initial",
"ADHD Follow-Up": "ADHD Follow-Up",
"ASD Initial Assessment": "ASD Initial",
"ASD - Helper": "Helper",
"ADHD/General - Helper": "Helper",
"All - Helper": "Helper",
"QA": "QA",
"General Adult Psychiatry": "General Adult",
}
# ── Colour palette (matches existing dashboard exactly) ──────────────────────
GROUP_COLORS = {
"ADHD Initial": "#3b82f6",
"ADHD Follow-Up": "#8b5cf6",
"ASD Initial": "#10b981",
"QA": "#f59e0b",
"General Adult": "#ef4444",
"Helper": "#94a3b8",
}
DARK_BG = "#0f1117"
CARD_BG = "#111827"
BORDER = "#1e2a3a"
TXT_MAIN = "#e2e8f0"
TXT_SUB = "#94a3b8"
TXT_DIM = "#475569"
COLORSCALE = [
[0.00, "#ef4444"],
[0.25, "#f97316"],
[0.50, "#fbbf24"],
[0.75, "#34d399"],
[1.00, "#10b981"],
]
SCORE_PAL = ["#ef4444", "#f97316", "#fbbf24", "#34d399", "#10b981"]
# ── Synthetic baseline scores (from original hardcoded rubric_dashboard.py) ──
# Used when the judge cannot run or as the initial seed before a real judge run.
BASELINE_SCORES = {
"reason_for_referral": dict(instruction_clarity=4, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=5, duplication_risk=2, structural_guidance=5, prompt_efficiency=4, notes="Strong example phrase list aids extraction. Missing explicit word count in prompt body. Low duplication risk as referral context is distinct."),
"history_of_presenting_complaint": dict(instruction_clarity=4, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=3, notes="Two-task structure discards Task 1 output — adds tokens without value. High duplication risk with Medical/Psychiatric History. Typo: 'transcriptd'."),
"psychiatric_history": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=3, notes="Very thorough extraction criteria. Two-task structure discards Task 1. Overlap risk with Medical History and Drug & Alcohol History addressed downstream."),
"medical_history": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=4, notes="Clear scope. Boundary with Psychiatric History is a known cross-contamination risk addressed downstream by the deduplication prompt."),
"family_medical_psychiatric_history": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=3, structural_guidance=5, prompt_efficiency=4, notes="SCAN reference is a strong clinical anchor. Clear exclusion of patient's own history. Duplication risk mainly from family conditions bleeding into patient Medical History."),
"drug_alcohol_forensic": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=3, pii_handling=5, clinical_safety=5, duplication_risk=3, structural_guidance=4, prompt_efficiency=3, notes="Four high-stakes sections in one prompt — failure in one affects all. Word count guidance is aggregate only. Typo: 'types od substances'."),
"adhd_diagnostic_formulation": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=5, clinical_safety=5, duplication_risk=2, structural_guidance=5, prompt_efficiency=4, notes="Strongest prompt in the set. Mandatory opening statement is excellent. Binary outcome structure is well-defined. DSM-5 severity anchoring is precise."),
"adhd_follow_up": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=5, pii_handling=5, clinical_safety=3, duplication_risk=2, structural_guidance=5, prompt_efficiency=3, notes="Highly prescriptive paragraph-by-paragraph structure. Clinical safety weaker — no explicit safeguarding instruction. Female-only conditional is a code-level dependency."),
"adhd_ia_deduplication": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=3, clinical_safety=4, duplication_risk=1, structural_guidance=5, prompt_efficiency=4, notes="Excellent JSON schema with confidence scores. Cross-heading risk pairs are well-specified. PII less critical as it operates on already-generated output."),
"asd_deduplication": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=3, clinical_safety=4, duplication_risk=1, structural_guidance=5, prompt_efficiency=4, notes="Near-identical to ADHD deduplication with ASD-specific heading pairs. Both could be unified into a single parameterised prompt."),
"asd_dev_history_social_interaction": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=3, notes="Rich reporting verb list is a nice touch. Combined heading reduces API calls but raises duplication risk. Typo: 'thourough'."),
"asd_social_communication": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=3, notes="Overlap with Developmental History is a significant risk. Expand-most-detailed-example instruction could conflict with word count targets."),
"asd_routines": dict(instruction_clarity=5, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=3, structural_guidance=5, prompt_efficiency=4, notes="Good coverage of all sensory domains. Target word count (~500) could be more explicitly embedded in the prompt body."),
"asd_diagnostic_formulation": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=5, clinical_safety=5, duplication_risk=2, structural_guidance=5, prompt_efficiency=4, notes="Uses ADHD prefix (not ASD prefix) — potential bug. Otherwise mirrors the quality of the ADHD formulation prompt. DSM-5 Level 1/2/3 anchoring is well-specified."),
"asd_dev_social_combined": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=5, structural_guidance=4, prompt_efficiency=2, notes="Highest duplication risk in the set. Combines two sections known to overlap. 1000-2000 word output makes accuracy hard to verify across both sections simultaneously."),
"general_adult_dictation": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=3, pii_handling=4, clinical_safety=4, duplication_risk=2, structural_guidance=5, prompt_efficiency=3, notes="Most flexible prompt — 21 headings with blanks allowed. Output specificity necessarily lower given open-ended dictation context. Manual Instructions heading is a thoughtful edge case handler."),
"general_adult_formulation": dict(instruction_clarity=4, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=5, duplication_risk=2, structural_guidance=4, prompt_efficiency=4, notes="Critical safety instruction (no diagnosis not explicitly stated) is well emphasised. Typo in guideline 4: 'DO NOT about the word transcript'."),
"adhd_prompt_prefix": dict(instruction_clarity=5, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=3, duplication_risk=1, structural_guidance=3, prompt_efficiency=5, notes="Efficient and well-structured. Clinical safety intentionally minimal — constraints live in child prompts. Could add a safeguarding reminder as a safety net."),
"asd_prompt_prefix": dict(instruction_clarity=5, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=3, duplication_risk=1, structural_guidance=3, prompt_efficiency=5, notes="Well-adapted from ADHD prefix for multi-source input. Uses American 'anonymized' instead of British 'anonymised' — inconsistency with the style guide."),
"remove_leading_tabs": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=5, clinical_safety=5, duplication_risk=1, structural_guidance=5, prompt_efficiency=5, notes="Utility function — not an LLM prompt. Scores reflect it performs a single, well-defined mechanical task correctly."),
}
# ═════════════════════════════════════════════════════════════════════════════
# JUDGE SYSTEM PROMPTS
# ═════════════════════════════════════════════════════════════════════════════
PROMPT_QUALITY_SYSTEM = """
You are an expert clinical AI prompt evaluator. Assess the quality of the given
psychiatric report-generation prompt across 8 dimensions.
Respond ONLY with a single valid JSON object. No preamble, no explanation, no
markdown fences. Any non-JSON output will be rejected and retried.
Required JSON schema:
{
"instruction_clarity": <integer 1-5>,
"constraint_completeness": <integer 1-5>,
"output_specificity": <integer 1-5>,
"pii_handling": <integer 1-5>,
"clinical_safety": <integer 1-5>,
"duplication_risk": <integer 1-5>,
"structural_guidance": <integer 1-5>,
"prompt_efficiency": <integer 1-5>,
"notes": "<one concise sentence — the single most important finding>"
}
Scoring guide (1 = very poor, 5 = excellent):
instruction_clarity — 5: every edge case handled explicitly; 1: vague or contradictory
constraint_completeness — 5: all safety/format guardrails present; 1: critical constraints missing
output_specificity — 5: format, length, headings fully defined; 1: no output format specified
pii_handling — 5: PII token protocol stated with examples; 1: no PII instructions
clinical_safety — 5: no unsolicited diagnosis possible; 1: model could infer diagnoses
duplication_risk — 5: LOW risk (good, distinct scope); 1: HIGH risk (content bleeds across sections)
structural_guidance — 5: headings, paragraphs, ordering fully prescribed; 1: no structure specified
prompt_efficiency — 5: minimal tokens, maximum clarity; 1: verbose and redundant
""".strip()
OUTPUT_QUALITY_SYSTEM = """
You are an expert clinical documentation reviewer. Given a psychiatric prompt and
the AI-generated output it produced, assess output quality across 8 dimensions.
Respond ONLY with a single valid JSON object. No preamble, no explanation, no
markdown fences. Any non-JSON output will be rejected and retried.
Required JSON schema:
{
"transcript_fidelity": <integer 1-5>,
"no_hallucination": <integer 1-5>,
"clinical_tone": <integer 1-5>,
"section_completeness": <integer 1-5>,
"pii_token_preserved": <integer 1-5>,
"no_unsolicited_diagnosis": <integer 1-5>,
"british_english": <integer 1-5>,
"word_count_adherence": <integer 1-5>,
"output_notes": "<one concise sentence — the single most important finding>"
}
Scoring guide (1 = very poor, 5 = excellent):
transcript_fidelity — 5: every claim verifiable from transcript; 1: significant content not traceable
no_hallucination — 5: zero invented facts; 1: clear hallucinations present
clinical_tone — 5: formal third-person psychiatric register; 1: colloquial or inappropriate
section_completeness — 5: all required headings populated; 1: key sections absent
pii_token_preserved — 5: all tokens (e.g. {{LOCATION-1}}) intact; 1: tokens modified or expanded
no_unsolicited_diagnosis — 5: no diagnosis not in transcript; 1: diagnosis introduced by model
british_english — 5: consistent British spelling throughout; 1: American English used systematically
word_count_adherence — 5: within 10% of target; 1: >50% over or under (score 3 if no target stated)
""".strip()
# ═════════════════════════════════════════════════════════════════════════════
# LOGGING
# ═════════════════════════════════════════════════════════════════════════════
# Timestamped console logging; level fixed at INFO (no CLI verbosity flag).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("autodeclare")  # shared module-level logger
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 1: OUTPUT CAPTURE HELPER ───────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def save_output_for_judge(
    prompt_name: str,
    output_text: str,
    run_id: str,
    staging_dir: str = DEFAULT_STAGING,
) -> pathlib.Path:
    """
    Persist one generated section for later judging.

    Call this immediately after each Claude API response in your clinical
    pipeline. Writes a single JSON record per section per run into
    ``<staging_dir>/outputs`` and returns the path written.

    Example:
        section_text = call_claude_api(prompt)
        save_output_for_judge("reason_for_referral", section_text, run_id="2026-02-27_001")
    """
    out_dir = pathlib.Path(staging_dir) / "outputs"
    out_dir.mkdir(parents=True, exist_ok=True)
    # Sanitise the prompt name for filesystem use: any character that is not
    # alphanumeric or an underscore collapses to "_".
    safe_chars = [c if c.isalnum() or c == "_" else "_" for c in prompt_name]
    record = {
        "prompt_name": prompt_name,
        "run_id": run_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "output": output_text,
    }
    target = out_dir / f"{run_id}_{''.join(safe_chars)}.json"
    target.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8")
    return target
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 2: LOAD PROMPT INVENTORY ───────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def load_inventory(csv_path: pathlib.Path) -> dict:
    """Parse prompt_inventory.csv into a dict keyed by the "Prompt Name" column.

    Rows with a blank name are dropped. Exits the process if the CSV is absent.
    """
    if not csv_path.exists():
        log.error(f"CSV not found: {csv_path}")
        sys.exit(1)
    with open(csv_path, newline="", encoding="utf-8") as handle:
        prompts = {
            row["Prompt Name"].strip(): row
            for row in csv.DictReader(handle)
            if row.get("Prompt Name", "").strip()
        }
    log.info(f"Loaded {len(prompts)} prompts from {csv_path.name}")
    return prompts
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 3: OLLAMA PREFLIGHT CHECK ──────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def check_ollama(client, model: str) -> bool:
    """
    Verify Ollama is reachable and the requested model is pulled.

    Prints actionable remediation instructions and returns False if either
    check fails; returns True when the model is available.
    """
    try:
        models = client.models.list()
        available = [m.id for m in models.data]
    except Exception as exc:
        # FIX: the previous version bound `exc` but never showed it, leaving
        # connection problems undiagnosable. Surface the underlying error.
        print("\n" + "─" * 60)
        print(" ERROR: Cannot reach Ollama at", client.base_url)
        print(f" Reason: {exc}")
        print(" Start it with: OLLAMA_HOST=0.0.0.0:11434 ollama serve &")
        print("─" * 60 + "\n")
        return False
    if model not in available:
        print("\n" + "─" * 60)
        print(f" ERROR: Model '{model}' not found in Ollama.")
        print(f" Pull it with: ollama pull {model}")
        print(f" Available: {', '.join(available) or '(none)'}")
        print("─" * 60 + "\n")
        return False
    log.info(f"Ollama OK — model '{model}' available")
    return True
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 4: JUDGE CALLS ─────────────────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def _call_judge(
    client,
    model: str,
    system: str,
    user_content: str,
    required_keys: list,
    label: str,
) -> Optional[dict]:
    """
    Core judge call with retry, JSON validation, and score coercion.

    Retries up to RETRY_LIMIT times with exponential backoff (RETRY_DELAY
    base, doubling each attempt). Returns the parsed score dict on success or
    None after all attempts fail. Dimension scores are clamped to integers in
    1-5; note fields pass through untouched.
    """
    for attempt in range(1, RETRY_LIMIT + 1):
        try:
            resp = client.chat.completions.create(
                model=model,
                temperature=0.0,
                max_tokens=512,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user_content},
                ],
                response_format={"type": "json_object"},
            )
            raw = resp.choices[0].message.content.strip()
            # Strip accidental markdown fences.
            # FIX: lstrip("json") strips any leading run of the characters
            # j/s/o/n — not the prefix "json" — so it could eat the start of
            # the payload. Remove the literal language tag instead.
            if raw.startswith("```"):
                parts = raw.split("```")
                if len(parts) > 1:
                    body = parts[1]
                    if body.startswith("json"):
                        body = body[len("json"):]
                    raw = body.strip()
            parsed = json.loads(raw)
            # Validate all required keys are present.
            missing = [k for k in required_keys if k not in parsed]
            if missing:
                raise ValueError(f"Missing keys: {missing}")
            # Coerce dimension scores to int (some models return floats/strings),
            # clamping to the 1-5 rubric range.
            notes_keys = {"notes", "output_notes"}
            for k in required_keys:
                if k not in notes_keys:
                    parsed[k] = max(1, min(5, int(round(float(parsed[k])))))
            return parsed
        except Exception as exc:
            delay = RETRY_DELAY * (2 ** (attempt - 1))
            if attempt < RETRY_LIMIT:
                log.warning(f"[{label}] attempt {attempt} failed ({exc}). Retry in {delay:.0f}s")
                time.sleep(delay)
            else:
                log.error(f"[{label}] failed after {RETRY_LIMIT} attempts: {exc}")
    return None
def judge_prompt_quality(client, model, name, prompt_text):
    """Judge A — score prompt design in isolation across the 8 prompt dimensions."""
    user = "\n\n".join([
        f"Prompt name: {name}",
        f"--- PROMPT TEXT START ---\n{prompt_text}\n--- PROMPT TEXT END ---",
    ])
    return _call_judge(
        client,
        model,
        system=PROMPT_QUALITY_SYSTEM,
        user_content=user,
        required_keys=PROMPT_DIM_IDS + ["notes"],
        label=f"{name} / JudgeA",
    )
def judge_output_quality(client, model, name, prompt_text, output_text):
    """Judge B — score the generated output against the prompt that produced it."""
    user = "\n\n".join([
        f"Prompt name: {name}",
        f"--- PROMPT TEXT START ---\n{prompt_text}\n--- PROMPT TEXT END ---",
        f"--- GENERATED OUTPUT START ---\n{output_text}\n--- GENERATED OUTPUT END ---",
    ])
    return _call_judge(
        client,
        model,
        system=OUTPUT_QUALITY_SYSTEM,
        user_content=user,
        required_keys=OUTPUT_DIM_IDS + ["output_notes"],
        label=f"{name} / JudgeB",
    )
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 5: SCORING ORCHESTRATION ───────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def _scan_outputs(outputs_dir: pathlib.Path, run_id: str) -> dict:
    """Read captured output files for one run, keyed by prompt name.

    Unreadable files are logged and skipped rather than aborting the run.
    """
    found = {}
    for path in sorted(outputs_dir.glob(f"{run_id}_*.json")):
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            pname = data.get("prompt_name", "")
            if pname:
                found[pname] = data
        except Exception as exc:
            log.warning(f"Cannot read {path.name}: {exc}")
    return found


def run_scoring(
    run_id: str,
    csv_path: pathlib.Path,
    staging_dir: pathlib.Path,
    model: str,
    base_url: str,
    prompts_only: bool,
    use_judge: bool,
) -> list:
    """
    Orchestrate all judge calls and return the complete scored list.

    Falls back to BASELINE_SCORES when the judge is unavailable or skipped,
    and writes <run_id>_scores.json to staging_dir/scores as a side effect.
    """
    prompts = load_inventory(csv_path)

    # Scan for captured output files (skipped entirely in prompts-only mode).
    outputs_dir = staging_dir / "outputs"
    output_files = {}
    if not prompts_only and outputs_dir.exists():
        output_files = _scan_outputs(outputs_dir, run_id)
    log.info(f"Found {len(output_files)} output file(s) for run '{run_id}'")

    # Initialise the Ollama client; degrade to baseline scores if the client
    # library is missing or the server/model check fails.
    client = None
    if use_judge:
        if not OPENAI_AVAILABLE:
            log.warning("openai package not installed — falling back to baseline scores")
            use_judge = False
        else:
            client = OpenAI(base_url=base_url, api_key=DEFAULT_API_KEY)
            if not check_ollama(client, model):
                log.warning("Ollama check failed — falling back to baseline scores")
                use_judge = False

    all_scores = []
    total = len(prompts)
    for idx, (pname, prow) in enumerate(prompts.items(), start=1):
        prompt_text = prow.get("Full Prompt Content", "").strip()
        assessment_type = prow.get("Assessment Type", "")
        group = ASSESSMENT_TO_GROUP.get(assessment_type.strip(), assessment_type.strip())
        # NOTE(review): pname IS the "Prompt Name" value, so this lookup always
        # yields pname and the title-cased fallback is dead code — a different
        # column may have been intended. Behaviour preserved as-is.
        label = prow.get("Prompt Name", pname.replace("_", " ").title())
        log.info(f"[{idx}/{total}] {pname}")

        # ── Prompt quality (Judge A or baseline) ─────────────────────────
        pq_from_judge = False  # tracks whether pq truly came from the judge
        if not prompt_text:
            # Utility entries (remove_leading_tabs) — use baseline if available.
            pq = BASELINE_SCORES.get(pname)
            log.info(f" Utility entry — using baseline")
        elif use_judge:
            log.info(f" → Judge A (prompt quality)...")
            pq = judge_prompt_quality(client, model, pname, prompt_text)
            if pq is None:
                log.warning(f" Judge A failed — falling back to baseline")
                pq = BASELINE_SCORES.get(pname)
            else:
                pq_from_judge = True
            time.sleep(INTER_CALL_DELAY)
        else:
            pq = BASELINE_SCORES.get(pname)
            log.info(f" Using baseline prompt scores")

        # ── Output quality (Judge B, only when an output was captured) ───
        oq = None
        has_output = pname in output_files
        if prompts_only:
            pass  # skip Judge B entirely
        elif has_output and use_judge and prompt_text:
            output_text = output_files[pname].get("output", "").strip()
            if output_text:
                log.info(f" → Judge B (output quality)...")
                oq = judge_output_quality(client, model, pname, prompt_text, output_text)
                if oq is None:
                    log.warning(f" Judge B failed — no output quality score recorded")
                time.sleep(INTER_CALL_DELAY)
            else:
                log.warning(f" Output file has empty 'output' field — skipping Judge B")
        elif has_output and not use_judge:
            log.info(f" Output file exists but judge disabled — skipping Judge B")

        all_scores.append({
            "name": pname,
            "label": label,
            "group": group,
            "run_id": run_id,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "has_output": has_output and not prompts_only,
            "is_utility": not bool(prompt_text),
            # BUGFIX: previously this recorded "judge" whenever pq was non-None
            # while the judge was enabled — including the case where Judge A
            # failed and the BASELINE fallback filled pq.
            "score_source": "judge" if pq_from_judge else "baseline",
            "prompt_quality": pq,
            "output_quality": oq,
        })

    # Persist scores.json so --dashboard-only reruns can reuse this run.
    scores_dir = staging_dir / "scores"
    scores_dir.mkdir(parents=True, exist_ok=True)
    scores_path = scores_dir / f"{run_id}_scores.json"
    scores_path.write_text(
        json.dumps(all_scores, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    judged = sum(1 for s in all_scores if s["score_source"] == "judge")
    baseline = sum(1 for s in all_scores if s["score_source"] == "baseline")
    with_oq = sum(1 for s in all_scores if s["output_quality"] is not None)
    log.info(f"Scores: {judged} judge | {baseline} baseline | {with_oq} with output quality")
    log.info(f"Saved → {scores_path}")
    return all_scores
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 6: DASHBOARD GENERATION ────────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
# ── Colour helpers ────────────────────────────────────────────────────────────
def score_color(val: float) -> str:
    """Map a 1-5 score to its palette colour, clamping out-of-range values."""
    idx = round(val) - 1
    return SCORE_PAL[min(4, max(0, idx))]
def hex_to_rgb(h: str) -> tuple:
    """Convert '#rrggbb' (leading '#' optional) to an (r, g, b) int tuple."""
    digits = h.lstrip("#")
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
def adjusted_avg(row: dict, dim_ids: list) -> float:
    """Mean of the dimension scores with duplication_risk inverted (6 - score),
    so that higher is uniformly better; rounded to 2 decimal places."""
    values = []
    for dim in dim_ids:
        values.append(6 - row[dim] if dim == "duplication_risk" else row[dim])
    return round(sum(values) / len(values), 2)
def fig_div(fig) -> str:
    """Render a Plotly figure as an embeddable HTML fragment (plotly.js not inlined)."""
    return fig.to_html(include_plotlyjs=False, full_html=False)
# ── Chart builders ────────────────────────────────────────────────────────────
def build_heatmap(df: pd.DataFrame, dim_ids: list, dim_labels: list, title: str):
    """Prompt × dimension score heatmap (1-5, red→green colourscale)."""
    matrix = df[dim_ids].values.tolist()
    cell_text = [[str(v) for v in line] for line in matrix]
    trace = go.Heatmap(
        z=matrix,
        x=dim_labels,
        y=df["label"].tolist(),
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=11, color="white", family="monospace"),
        colorscale=COLORSCALE,
        zmin=1,
        zmax=5,
        colorbar=dict(
            title=dict(text="Score", font=dict(color=TXT_SUB)),
            tickfont=dict(color=TXT_SUB, family="monospace"),
            bgcolor=CARD_BG,
            bordercolor=BORDER,
            thickness=14,
        ),
        hovertemplate="<b>%{y}</b><br>%{x}: %{z}/5<extra></extra>",
    )
    fig = go.Figure(trace)
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        plot_bgcolor=CARD_BG,
        paper_bgcolor=CARD_BG,
        font=dict(color=TXT_SUB, family="monospace", size=10),
        xaxis=dict(tickangle=-35, showgrid=False, tickfont=dict(size=10, color=TXT_SUB)),
        # Reversed so the first DataFrame row appears at the top.
        yaxis=dict(showgrid=False, tickfont=dict(size=10, color=TXT_MAIN), autorange="reversed"),
        margin=dict(l=310, r=80, t=60, b=140),
        height=680,
    )
    return fig
def build_bar(df: pd.DataFrame, title: str):
    """Horizontal ranking of adjusted averages, coloured by group, with a
    synthetic legend (one invisible scatter marker per group)."""
    ranked = df.sort_values("adj_avg")
    bar_colors = [GROUP_COLORS.get(g, "#6b7280") for g in ranked["group"]]
    fig = go.Figure(go.Bar(
        x=ranked["adj_avg"],
        y=ranked["label"],
        orientation="h",
        marker=dict(color=bar_colors, line=dict(width=0)),
        text=ranked["adj_avg"].astype(str),
        textposition="outside",
        textfont=dict(color=TXT_SUB, size=10, family="monospace"),
        hovertemplate="<b>%{y}</b><br>Adj. avg: %{x}<extra></extra>",
    ))
    # Bars don't get per-colour legend items, so add an empty scatter trace
    # per group purely for the legend swatch.
    for group_name, swatch in GROUP_COLORS.items():
        fig.add_trace(go.Scatter(
            x=[None], y=[None], mode="markers",
            marker=dict(size=10, color=swatch), name=group_name,
        ))
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        plot_bgcolor=CARD_BG,
        paper_bgcolor=CARD_BG,
        font=dict(color=TXT_SUB, family="monospace", size=10),
        xaxis=dict(range=[0, 5.8], showgrid=True, gridcolor=BORDER,
                   tickfont=dict(size=10, color=TXT_SUB)),
        yaxis=dict(showgrid=False, tickfont=dict(size=10, color=TXT_MAIN)),
        legend=dict(bgcolor=CARD_BG, bordercolor=BORDER, borderwidth=1,
                    font=dict(color=TXT_SUB, size=10)),
        margin=dict(l=310, r=60, t=60, b=40),
        height=620,
    )
    return fig
def build_dim_averages(df: pd.DataFrame, dimensions: list, title: str):
    """Fleet-wide mean per dimension as a horizontal bar chart, sorted
    ascending; duplication_risk is inverted (6 - score) so higher is better."""
    averages = []
    for dim_id, dim_label in dimensions:
        if dim_id == "duplication_risk":
            mean_val = round((6 - df[dim_id]).mean(), 2)
            averages.append((dim_label + " (inv.)", mean_val))
        else:
            averages.append((dim_label, round(df[dim_id].mean(), 2)))
    averages.sort(key=lambda pair: pair[1])
    names = [name for name, _ in averages]
    values = [value for _, value in averages]
    fig = go.Figure(go.Bar(
        x=values,
        y=names,
        orientation="h",
        marker=dict(color=[score_color(v) for v in values], line=dict(width=0)),
        text=[str(v) for v in values],
        textposition="outside",
        textfont=dict(color=TXT_SUB, size=11, family="monospace"),
        hovertemplate="<b>%{y}</b><br>Average: %{x}<extra></extra>",
    ))
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        plot_bgcolor=CARD_BG,
        paper_bgcolor=CARD_BG,
        font=dict(color=TXT_SUB, family="monospace", size=11),
        xaxis=dict(range=[0, 5.6], showgrid=True, gridcolor=BORDER,
                   tickfont=dict(size=10, color=TXT_SUB)),
        yaxis=dict(showgrid=False, tickfont=dict(size=11, color=TXT_MAIN)),
        margin=dict(l=240, r=60, t=60, b=40),
        height=400,
    )
    return fig
def build_radar_overlay(df: pd.DataFrame, dim_ids: list, dim_labels: list, title: str):
    """One radar trace per group (group-mean per dimension) overlaid on a
    single polar plot. Groups with no rows are omitted."""
    closed_theta = dim_labels + dim_labels[:1]  # repeat first label to close the polygon
    fig = go.Figure()
    for group_name, swatch in GROUP_COLORS.items():
        subset = df[df["group"] == group_name]
        if subset.empty:
            continue
        means = [round(subset[d].mean(), 2) for d in dim_ids]
        r, g, b = hex_to_rgb(swatch)
        fig.add_trace(go.Scatterpolar(
            r=means + means[:1],
            theta=closed_theta,
            fill="toself",
            fillcolor=f"rgba({r},{g},{b},0.12)",
            line=dict(color=swatch, width=2),
            name=group_name,
            hovertemplate="%{theta}: %{r:.2f}<extra>" + group_name + "</extra>",
        ))
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        polar=dict(
            bgcolor="#0d1117",
            radialaxis=dict(visible=True, range=[0, 5], showticklabels=False,
                            gridcolor=BORDER, linecolor=BORDER),
            angularaxis=dict(tickfont=dict(color=TXT_SUB, size=10, family="monospace"),
                             gridcolor=BORDER, linecolor=BORDER),
        ),
        paper_bgcolor=CARD_BG,
        legend=dict(bgcolor=CARD_BG, bordercolor=BORDER, borderwidth=1,
                    font=dict(color=TXT_SUB, size=11)),
        margin=dict(l=60, r=60, t=60, b=60),
        height=500,
    )
    return fig
def build_radar_grid(prompts_list: list, dim_ids: list, dim_labels: list, title: str):
    """Small-multiples grid of per-prompt radar charts, 4 per row."""
    theta_closed = dim_labels + [dim_labels[0]]  # repeat first label to close polygon
    COLS = 4
    N = len(prompts_list)
    ROWS = -(-N // COLS)  # ceiling division
    specs = [[{"type": "polar"}] * COLS for _ in range(ROWS)]
    # Pad titles so every grid cell (including trailing empties) gets one.
    titles = [p["label"] for p in prompts_list] + [""] * (ROWS * COLS - N)
    fig = make_subplots(rows=ROWS, cols=COLS, specs=specs,
                        subplot_titles=titles,
                        vertical_spacing=0.08, horizontal_spacing=0.04)
    for i, pd_ in enumerate(prompts_list):
        ri = i // COLS + 1  # 1-based subplot row
        ci = i % COLS + 1   # 1-based subplot column
        scores = [pd_[d] for d in dim_ids]
        r_c = scores + [scores[0]]
        col = GROUP_COLORS.get(pd_["group"], "#6b7280")
        r, g, b = hex_to_rgb(col)
        fig.add_trace(go.Scatterpolar(
            r=r_c, theta=theta_closed, fill="toself",
            fillcolor=f"rgba({r},{g},{b},0.18)", line=dict(color=col, width=1.5),
            name=pd_["label"], showlegend=False,
            hovertemplate="%{theta}: %{r}<extra>" + pd_["label"] + "</extra>",
        ), row=ri, col=ci)
    # Plotly names polar subplot axes "polar", "polar2", "polar3", ... —
    # restyle every cell, including the empty padding ones, to the dark theme.
    for i in range(1, ROWS * COLS + 1):
        key = f"polar{i}" if i > 1 else "polar"
        fig.update_layout(**{key: dict(
            bgcolor="#0d1117",
            radialaxis=dict(visible=True, range=[0, 5], showticklabels=False,
                            gridcolor=BORDER, linecolor=BORDER),
            angularaxis=dict(tickfont=dict(color=TXT_DIM, size=7),
                             gridcolor=BORDER, linecolor=BORDER),
        )})
    # Subplot titles are layout annotations; restyle them to match the theme.
    for ann in fig.layout.annotations:
        ann.font = dict(color=TXT_SUB, size=9, family="monospace")
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        paper_bgcolor=CARD_BG, height=ROWS * 200 + 80,  # scale height with row count
        margin=dict(l=20, r=20, t=60, b=20),
    )
    return fig
def build_notes_table(df: pd.DataFrame, notes_col: str, html_border: str) -> str:
    """Build the <tr> rows for a reviewer-notes HTML table.

    Rows are ordered worst-first (ascending adjusted average). Each row shows
    the group tag, the prompt label, the colour-coded adjusted average, and
    the free-text note from *notes_col*.
    """
    fragments = []
    for _, rec in df.sort_values("adj_avg").iterrows():
        group_colour = GROUP_COLORS.get(rec["group"], "#6b7280")
        adj = rec["adj_avg"]
        adj_colour = score_color(adj)
        note_text = rec.get(notes_col, "") or ""  # normalise None/NaN-ish to ""
        fragments.append(f"""
<tr>
<td style="padding:10px 12px;border-bottom:1px solid {html_border};">
<span style="font-size:10px;color:{group_colour};font-family:monospace;
text-transform:uppercase;display:block;margin-bottom:3px;">{rec['group']}</span>
<span style="font-size:12px;color:{TXT_MAIN};">{rec['label']}</span>
</td>
<td style="padding:10px 12px;border-bottom:1px solid {html_border};font-size:18px;
font-weight:bold;font-family:monospace;color:{adj_colour};text-align:center;">{adj}</td>
<td style="padding:10px 12px;border-bottom:1px solid {html_border};font-size:12px;
color:{TXT_SUB};line-height:1.6;font-family:'Helvetica Neue',sans-serif;">{note_text}</td>
</tr>""")
    return "".join(fragments)
def build_score_source_badge(source: str) -> str:
    """Return a small HTML pill indicating where a prompt's score came from.

    Args:
        source: "judge" for live judge-scored entries; any other value is
            treated as a hardcoded baseline seed.

    Returns:
        An inline-styled ``<span>`` fragment (green for judge, blue for baseline).
    """
    # Fix: these were marked as f-strings despite containing no placeholders
    # (ruff F541) — plain string literals are the correct form.
    if source == "judge":
        return '<span style="background:#064e3b;color:#6ee7b7;font-size:10px;font-family:monospace;padding:2px 8px;border-radius:4px;">judge</span>'
    return '<span style="background:#1e3a5f;color:#93c5fd;font-size:10px;font-family:monospace;padding:2px 8px;border-radius:4px;">baseline</span>'
def generate_dashboard(all_scores: list, output_path: pathlib.Path, run_id: str) -> None:
    """Build the complete self-contained HTML dashboard from scored data.

    Args:
        all_scores: per-prompt score records; each record may carry a
            "prompt_quality" dict (Judge A, 8 dims) and/or an
            "output_quality" dict (Judge B, 8 dims).
        output_path: destination path for the generated HTML file.
        run_id: run identifier displayed in the dashboard header.

    Side effects: writes the HTML file (with plotly.min.js inlined so no CDN
    access is needed) and logs its size. Logs an error and returns early when
    no complete prompt-quality records are present.
    """
    # ── Prompt quality DataFrame (all entries with scores) ───────────────────
    pq_rows = []
    for s in all_scores:
        pq = s.get("prompt_quality")
        # Only keep entries carrying the full set of 8 prompt-quality dims.
        if pq and all(k in pq for k in PROMPT_DIM_IDS):
            row = {
                "name": s["name"],
                "label": s["label"],
                "group": s["group"],
                "source": s.get("score_source", "baseline"),
                "notes": pq.get("notes", ""),
            }
            for dim in PROMPT_DIM_IDS:
                row[dim] = pq[dim]
            row["adj_avg"] = adjusted_avg(row, PROMPT_DIM_IDS)
            pq_rows.append(row)
    # ── Output quality DataFrame (only entries with oq scores) ───────────────
    oq_rows = []
    for s in all_scores:
        oq = s.get("output_quality")
        if oq and all(k in oq for k in OUTPUT_DIM_IDS):
            row = {
                "name": s["name"],
                "label": s["label"],
                "group": s["group"],
                "notes": oq.get("output_notes", ""),
            }
            for dim in OUTPUT_DIM_IDS:
                row[dim] = oq[dim]
            row["adj_avg"] = adjusted_avg(row, OUTPUT_DIM_IDS)
            oq_rows.append(row)
    if not pq_rows:
        log.error("No scoreable prompts found — cannot generate dashboard")
        return
    df_pq = pd.DataFrame(pq_rows)
    has_oq = bool(oq_rows)
    df_oq = pd.DataFrame(oq_rows) if has_oq else None
    n_prompts = len(df_pq)
    # Judge-scored count is taken over ALL records, not just scoreable ones.
    n_judged = sum(1 for s in all_scores if s.get("score_source") == "judge")
    # ── Build all charts ──────────────────────────────────────────────────────
    log.info("Generating charts...")
    # Prompt quality charts (original set)
    fig_pq_heatmap = build_heatmap(df_pq, PROMPT_DIM_IDS, PROMPT_DIM_LABELS,
                                   "Prompt Quality — Score Heatmap")
    fig_pq_bar = build_bar(df_pq, "Prompt Quality — Adjusted Average by Prompt")
    fig_pq_dims = build_dim_averages(df_pq, PROMPT_DIMS,
                                     f"Prompt Quality — Dimension Averages ({n_prompts} Prompts)")
    fig_pq_radar = build_radar_overlay(df_pq, PROMPT_DIM_IDS, PROMPT_DIM_LABELS,
                                       "Prompt Quality — Group Radar Overlay")
    fig_pq_grid = build_radar_grid(pq_rows, PROMPT_DIM_IDS, PROMPT_DIM_LABELS,
                                   "Prompt Quality — Individual Radar Grid")
    pq_notes_html = build_notes_table(df_pq, "notes", BORDER)
    # Output quality charts (new section — only if data exists)
    oq_section = ""
    if has_oq:
        fig_oq_heatmap = build_heatmap(df_oq, OUTPUT_DIM_IDS, OUTPUT_DIM_LABELS,
                                       "Output Quality — Score Heatmap")
        fig_oq_bar = build_bar(df_oq, "Output Quality — Adjusted Average by Prompt")
        fig_oq_dims = build_dim_averages(df_oq, OUTPUT_DIMS,
                                         f"Output Quality — Dimension Averages ({len(oq_rows)} Prompts)")
        fig_oq_radar = build_radar_overlay(df_oq, OUTPUT_DIM_IDS, OUTPUT_DIM_LABELS,
                                           "Output Quality — Group Radar Overlay")
        fig_oq_grid = build_radar_grid(oq_rows, OUTPUT_DIM_IDS, OUTPUT_DIM_LABELS,
                                       "Output Quality — Individual Radar Grid")
        oq_notes_html = build_notes_table(df_oq, "notes", BORDER)
        oq_section = f"""
<div class="section-header">
<h2 class="section-title">Output Quality</h2>
<p class="section-sub">Assessed by Judge B against transcript fidelity, clinical tone, PII preservation, and more.
<span style="color:{TXT_DIM};font-size:11px;"> · {len(oq_rows)} outputs scored</span>
</p>
</div>
<div class="card"><h2 class="st">Output Quality — Score Heatmap</h2>{fig_div(fig_oq_heatmap)}</div>
<div class="two-col">
<div class="card"><h2 class="st">Adjusted Average by Prompt</h2>{fig_div(fig_oq_bar)}</div>
<div class="card"><h2 class="st">Dimension Averages</h2>{fig_div(fig_oq_dims)}</div>
</div>
<div class="two-col">
<div class="card"><h2 class="st">Group Radar Overlay</h2>{fig_div(fig_oq_radar)}</div>
<div class="card"><h2 class="st">Individual Radar Grid</h2>{fig_div(fig_oq_grid)}</div>
</div>
<div class="card">
<h2 class="st">Output Quality — Reviewer Notes</h2>
<table>
<thead><tr>
<th style="width:240px;">Prompt</th>
<th style="width:80px;text-align:center;">Adj. Avg</th>
<th>Notes</th>
</tr></thead>
<tbody>{oq_notes_html}</tbody>
</table>
</div>"""
    # ── Score source legend for prompt quality notes table ────────────────────
    source_col_html = ""
    for _, row in df_pq.sort_values("adj_avg").iterrows():
        badge = build_score_source_badge(row.get("source", "baseline"))
        source_col_html += f"""
<tr><td style="padding:6px 12px;border-bottom:1px solid {BORDER};
font-size:11px;color:{TXT_SUB};font-family:monospace;">{row['label']}</td>
<td style="padding:6px 12px;border-bottom:1px solid {BORDER};">{badge}</td></tr>"""
    # ── Load bundled Plotly JS ────────────────────────────────────────────────
    # Inlining the bundled plotly.min.js keeps the dashboard fully
    # self-contained (no CDN required on an air-gapped host).
    # NOTE(review): assumes the plotly package ships package_data/plotly.min.js
    # at this path — verify against the installed plotly version.
    plotly_js_path = os.path.join(
        os.path.dirname(plotly.__file__), "package_data", "plotly.min.js"
    )
    with open(plotly_js_path, "r", encoding="utf-8") as f:
        plotly_js = f.read()
    # ── Assemble HTML ─────────────────────────────────────────────────────────
    # Doubled braces ({{ }}) below are f-string escapes for literal CSS braces.
    html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AutoDeclare — Rubric Dashboard</title>
<script>{plotly_js}</script>
<style>
*,*::before,*::after{{box-sizing:border-box;margin:0;padding:0;}}
body {{background:{DARK_BG};color:{TXT_MAIN};font-family:Georgia,serif;}}
header {{background:#1a1f2e;border-bottom:1px solid {BORDER};padding:28px 40px;}}
header p.eyebrow {{color:{TXT_DIM};font-size:11px;letter-spacing:.2em;
font-family:monospace;text-transform:uppercase;margin-bottom:6px;}}
header h1 {{font-size:28px;font-weight:normal;color:{TXT_MAIN};margin-bottom:6px;}}
header p.sub {{color:{TXT_DIM};font-size:12px;font-family:'Helvetica Neue',sans-serif;}}
main {{max-width:1300px;margin:0 auto;padding:32px 40px;}}
.card {{background:{CARD_BG};border:1px solid {BORDER};border-radius:10px;
padding:20px;margin-bottom:24px;}}
h2.st {{font-size:13px;font-weight:normal;color:{TXT_SUB};font-family:monospace;
letter-spacing:.1em;text-transform:uppercase;margin-bottom:16px;}}
.two-col {{display:grid;grid-template-columns:1fr 1fr;gap:24px;margin-bottom:24px;}}
.section-header {{margin:40px 0 20px;padding-bottom:12px;
border-bottom:2px solid {BORDER};}}
.section-title {{font-size:11px;font-family:monospace;letter-spacing:.2em;
text-transform:uppercase;color:#64748b;margin-bottom:6px;}}
.section-sub {{font-size:13px;color:{TXT_DIM};font-family:'Helvetica Neue',sans-serif;}}
table {{width:100%;border-collapse:collapse;}}
th {{text-align:left;padding:8px 12px;color:{TXT_DIM};font-size:11px;
font-family:monospace;letter-spacing:.05em;border-bottom:2px solid {BORDER};}}
.meta-grid {{display:grid;grid-template-columns:repeat(4,1fr);gap:12px;margin-bottom:24px;}}
.meta-card {{background:{CARD_BG};border:1px solid {BORDER};border-radius:8px;
padding:14px 16px;}}
.meta-card .label {{font-size:10px;font-family:monospace;letter-spacing:.12em;
text-transform:uppercase;color:{TXT_DIM};margin-bottom:6px;}}
.meta-card .value {{font-size:20px;font-family:monospace;color:{TXT_MAIN};}}
@media(max-width:900px){{.two-col{{grid-template-columns:1fr;}}
.meta-grid{{grid-template-columns:repeat(2,1fr);}}main{{padding:16px;}}}}
</style>
</head>
<body>
<header>
<p class="eyebrow">AutoDeclare · AI-as-Judge Pipeline</p>
<h1>Rubric Dashboard</h1>
<p class="sub">Run: <code style="color:{TXT_MAIN};">{run_id}</code>
· {n_prompts} prompts
· {n_judged} judge-scored
· {len(oq_rows) if has_oq else 0} outputs assessed
· * Duplication Risk is inverted — lower score = lower risk = better
</p>
</header>
<main>
<!-- ── Meta summary cards ── -->
<div class="meta-grid">
<div class="meta-card">
<div class="label">Total Prompts</div>
<div class="value">{n_prompts}</div>
</div>
<div class="meta-card">
<div class="label">Judge-Scored</div>
<div class="value" style="color:#10b981;">{n_judged}</div>
</div>
<div class="meta-card">
<div class="label">Baseline-Seeded</div>
<div class="value" style="color:#3b82f6;">{n_prompts - n_judged}</div>
</div>
<div class="meta-card">
<div class="label">Outputs Assessed</div>
<div class="value" style="color:#f59e0b;">{len(oq_rows) if has_oq else 0}</div>
</div>
</div>
<!-- ══ PROMPT QUALITY SECTION ══ -->
<div class="section-header">
<h2 class="section-title">Prompt Quality</h2>
<p class="section-sub">Assessed by Judge A — prompt design evaluated in isolation across 8 rubric dimensions.</p>
</div>
<div class="card"><h2 class="st">Score Heatmap — All Prompts × All Dimensions</h2>{fig_div(fig_pq_heatmap)}</div>
<div class="two-col">
<div class="card"><h2 class="st">Adjusted Average by Prompt</h2>{fig_div(fig_pq_bar)}</div>
<div class="card"><h2 class="st">Dimension Averages — All Prompts</h2>{fig_div(fig_pq_dims)}</div>
</div>
<div class="two-col">
<div class="card"><h2 class="st">Group Radar Overlay</h2>{fig_div(fig_pq_radar)}</div>
<div class="card"><h2 class="st">Individual Prompt Radar Grid</h2>{fig_div(fig_pq_grid)}</div>
</div>
<div class="card">
<h2 class="st">Reviewer Notes</h2>
<table>
<thead><tr>
<th style="width:240px;">Prompt</th>
<th style="width:80px;text-align:center;">Adj. Avg</th>
<th>Notes</th>
</tr></thead>
<tbody>{pq_notes_html}</tbody>
</table>
</div>
<!-- ── Score source legend ── -->
<div class="card">
<h2 class="st">Score Sources</h2>
<p style="font-size:13px;color:{TXT_DIM};margin-bottom:12px;">
Indicates whether each prompt's score came from the live Ollama judge or the
hardcoded baseline. Run with <code>--run-id</code> and Ollama available to replace baseline entries.
</p>
<table style="max-width:500px;">
<thead><tr><th>Prompt</th><th>Source</th></tr></thead>
<tbody>{source_col_html}</tbody>
</table>
</div>
<!-- ══ OUTPUT QUALITY SECTION ══ -->
{oq_section if has_oq else f'''
<div class="section-header">
<h2 class="section-title">Output Quality</h2>
<p class="section-sub" style="color:{TXT_DIM};">
No output files found for run <code>{run_id}</code>.
Add <code>save_output_for_judge()</code> calls to your pipeline and re-run
with <code>--run-id {run_id}</code> to populate this section.
</p>
</div>'''}
</main>
</body>
</html>"""
    output_path.write_text(html, encoding="utf-8")
    size_kb = output_path.stat().st_size // 1024
    log.info(f"Dashboard written → {output_path} ({size_kb} KB)")
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 7: SSH TUNNEL SERVER ────────────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
class _QuietHandler(http.server.SimpleHTTPRequestHandler):
"""Minimal HTTP handler — suppresses request logs, serves from html directory."""
def __init__(self, *args, directory=None, **kwargs):
super().__init__(*args, directory=directory, **kwargs)
def log_message(self, fmt, *args):
pass # silence per-request logs
def serve_dashboard(output_path: pathlib.Path, port: int) -> None:
    """
    Bind an HTTP server to 127.0.0.1 only (never 0.0.0.0) and print the
    SSH tunnel command required to access it from a remote machine.

    Args:
        output_path: path to the generated dashboard HTML; its parent
            directory becomes the server's document root.
        port: local TCP port to bind on the loopback interface.

    Blocks until interrupted (Ctrl+C / stdin close), then shuts the server down.
    """
    directory = str(output_path.parent.resolve())
    filename = output_path.name
    # BUG FIX: the printed URL previously embedded the literal placeholder
    # "(unknown)" instead of the dashboard filename, so the link 404'd.
    url = f"http://127.0.0.1:{port}/{filename}"
    def handler_factory(*args, **kwargs):
        # Pin the document root so the handler serves the dashboard directory
        # regardless of the process's current working directory.
        return _QuietHandler(*args, directory=directory, **kwargs)
    socketserver.TCPServer.allow_reuse_address = True
    try:
        httpd = socketserver.TCPServer(("127.0.0.1", port), handler_factory)
    except OSError as exc:
        # Port already in use (or not permitted) — advise and bail out.
        log.error(f"Cannot bind to port {port}: {exc}")
        log.error(f"Try: python autodeclare.py --port {port + 1}")
        return
    thread = threading.Thread(target=httpd.serve_forever, daemon=True)
    thread.start()
    ec2_ip = "<your-ec2-public-ip>"
    print()
    print("─" * 64)
    print(" AutoDeclare Dashboard — Ready")
    print("─" * 64)
    print()
    print(" The server is bound to 127.0.0.1 only (SSH tunnel required).")
    print()
    print(" Run this command on your LOCAL machine:")
    print()
    print(f" ssh -L {port}:127.0.0.1:{port} ubuntu@{ec2_ip}")
    print()
    print(" Then open this URL in your local browser:")
    print()
    print(f" {url}")
    print()
    print(" Press Ctrl+C to stop the server.")
    print("─" * 64)
    print()
    try:
        # signal.pause() sleeps until a signal arrives (POSIX only); on
        # platforms without it (AttributeError) fall back to blocking on stdin.
        signal.pause()
    except (AttributeError, OSError):
        try:
            input()
        except (EOFError, KeyboardInterrupt):
            pass
    finally:
        httpd.shutdown()
    log.info("Server stopped.")
# ═════════════════════════════════════════════════════════════════════════════
# CLI
# ═════════════════════════════════════════════════════════════════════════════
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for the pipeline."""
    parser = argparse.ArgumentParser(
        description="AutoDeclare — AI-as-Judge pipeline + Rubric Dashboard",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python autodeclare.py --run-id 2026-02-27_001
python autodeclare.py --prompts-only
python autodeclare.py --dashboard-only --run-id 2026-02-27_001
python autodeclare.py --no-serve --run-id 2026-02-27_001
python autodeclare.py --run-id 2026-02-27_001 --model mistral-small
""",
    )
    # Value options (all carry file-level defaults).
    parser.add_argument(
        "--run-id",
        default="latest",
        help="Run ID for output file matching (default: 'latest')",
    )
    parser.add_argument(
        "--csv",
        default=DEFAULT_CSV,
        help=f"Path to prompt_inventory.csv (default: {DEFAULT_CSV})",
    )
    parser.add_argument(
        "--staging",
        default=DEFAULT_STAGING,
        help=f"Staging directory root (default: {DEFAULT_STAGING})",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Ollama model name (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--base-url",
        default=DEFAULT_BASE_URL,
        help=f"Ollama base URL (default: {DEFAULT_BASE_URL})",
    )
    parser.add_argument(
        "--port",
        default=DEFAULT_PORT,
        type=int,
        help=f"Dashboard server port (default: {DEFAULT_PORT})",
    )
    # Mode flags.
    parser.add_argument(
        "--prompts-only",
        action="store_true",
        help="Score prompts only — skip Judge B",
    )
    parser.add_argument(
        "--dashboard-only",
        action="store_true",
        help="Skip judging — rebuild dashboard from existing scores.json",
    )
    parser.add_argument(
        "--no-serve",
        action="store_true",
        help="Write HTML and exit — do not start the server",
    )
    parser.add_argument(
        "--no-judge",
        action="store_true",
        help="Use baseline scores only — do not call Ollama",
    )
    return parser.parse_args()
def main() -> None:
    """CLI entry point: obtain scores (fresh run or saved file), build the
    dashboard, and optionally serve it over the loopback interface."""
    args = parse_args()
    staging = pathlib.Path(args.staging)
    staging.mkdir(parents=True, exist_ok=True)
    dashboard_path = pathlib.Path(OUTPUT_HTML)
    # ── Dashboard-only mode: load existing scores.json ────────────────────
    if args.dashboard_only:
        scores_path = staging / "scores" / f"{args.run_id}_scores.json"
        if not scores_path.exists():
            log.error(f"Scores file not found: {scores_path}")
            log.error("Run without --dashboard-only first to generate scores.")
            sys.exit(1)
        all_scores = json.loads(scores_path.read_text(encoding="utf-8"))
        log.info(f"Loaded {len(all_scores)} entries from {scores_path.name}")
    # ── Full pipeline: judge + score ──────────────────────────────────────
    else:
        all_scores = run_scoring(
            run_id=args.run_id,
            csv_path=pathlib.Path(args.csv),
            staging_dir=staging,
            model=args.model,
            base_url=args.base_url,
            prompts_only=args.prompts_only,
            use_judge=not args.no_judge,
        )
    # ── Generate dashboard, then serve it (unless --no-serve) ────────────
    generate_dashboard(all_scores, dashboard_path, args.run_id)
    if not args.no_serve:
        serve_dashboard(dashboard_path, args.port)
# Script entry point — run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
autodeclare.py — AutoDeclare AI-as-Judge Pipeline (integrated)
=================================================================
Single-script implementation of the AutoDeclare design:
1. Load all 28 prompts from prompt_inventory.csv
2. Seed synthetic baseline scores from hardcoded rubric values
3. Call Judge A (prompt quality, 8 dims) via local Ollama for every
substantive prompt — temperature=0, JSON-only output contract
4. Call Judge B (output quality, 8 dims) for any prompt that has a
captured output file in the staging directory
5. Write scores.json to /data/judge_staging/scores/
6. Rebuild rubric_dashboard.html — all original charts PLUS a new
output quality section — fully self-contained, no CDN required
7. Start an HTTP server bound to 127.0.0.1:8080 only (SSH tunnel access)
and print the tunnel command to run on your local machine
Usage
-----
# Full run (prompt quality + output quality if files exist):
python autodeclare.py --run-id 2026-02-27_001
# Score prompts only — no output files required:
python autodeclare.py --prompts-only
# Use a smaller model on a g4dn.xlarge:
python autodeclare.py --run-id 2026-02-27_001 --model mistral-small
# Skip the judge — just rebuild the dashboard from an existing scores file:
python autodeclare.py --dashboard-only --run-id 2026-02-27_001
# Skip serving (write HTML and exit):
python autodeclare.py --no-serve --run-id 2026-02-27_001
Dependencies
------------
pip install openai pandas plotly
Output capture helper (paste into your clinical pipeline)
---------------------------------------------------------
from autodeclare import save_output_for_judge
save_output_for_judge(
prompt_name="reason_for_referral",
output_text=section_text,
run_id=current_run_id,
)
"""
# ═════════════════════════════════════════════════════════════════════════════
# IMPORTS
# ═════════════════════════════════════════════════════════════════════════════
import argparse
import csv
import http.server
import json
import logging
import os
import pathlib
import signal
import socketserver
import sys
import threading
import time
from datetime import datetime, timezone
from typing import Optional
try:
import pandas as pd
import plotly.graph_objects as go
import plotly
from plotly.subplots import make_subplots
except ImportError:
sys.exit("Missing packages. Install with: pip install pandas plotly")
try:
from openai import OpenAI
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
# ═════════════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ═════════════════════════════════════════════════════════════════════════════
# Ollama connection defaults — the OpenAI-compatible client points at the
# local Ollama server; the API key is required by the client but ignored.
DEFAULT_BASE_URL = "http://localhost:11434/v1"
DEFAULT_API_KEY = "ollama"
DEFAULT_MODEL = "llama3.3"
# Input inventory, staging root, and dashboard output defaults.
DEFAULT_CSV = "prompt_inventory.csv"
DEFAULT_STAGING = "/data/judge_staging"
DEFAULT_PORT = 8080
OUTPUT_HTML = "rubric_dashboard.html"
# Judge-call retry policy: up to RETRY_LIMIT attempts per prompt, RETRY_DELAY
# seconds between retries, and a short pause between successive judge calls.
RETRY_LIMIT = 3
RETRY_DELAY = 2.0
INTER_CALL_DELAY = 0.5
# ── Prompt quality dimensions — must match rubric_dashboard DIM_IDS ──────────
# (dimension_id, human-readable label) pairs; the ids double as JSON keys in
# the judge response schema. "Duplication Risk*" is inverted: lower = better.
PROMPT_DIMS = [
    ("instruction_clarity", "Instruction Clarity"),
    ("constraint_completeness", "Constraint Completeness"),
    ("output_specificity", "Output Specificity"),
    ("pii_handling", "PII Token Handling"),
    ("clinical_safety", "Clinical Safety"),
    ("duplication_risk", "Duplication Risk*"),
    ("structural_guidance", "Structural Guidance"),
    ("prompt_efficiency", "Prompt Efficiency"),
]
PROMPT_DIM_IDS = [d[0] for d in PROMPT_DIMS]
PROMPT_DIM_LABELS = [d[1] for d in PROMPT_DIMS]
# ── Output quality dimensions — new, assessed against generated output ────────
OUTPUT_DIMS = [
    ("transcript_fidelity", "Transcript Fidelity"),
    ("no_hallucination", "No Hallucination"),
    ("clinical_tone", "Clinical Tone"),
    ("section_completeness", "Section Completeness"),
    ("pii_token_preserved", "PII Token Preserved"),
    ("no_unsolicited_diagnosis", "No Unsolicited Diagnosis"),
    ("british_english", "British English"),
    ("word_count_adherence", "Word Count Adherence"),
]
OUTPUT_DIM_IDS = [d[0] for d in OUTPUT_DIMS]
OUTPUT_DIM_LABELS = [d[1] for d in OUTPUT_DIMS]
# ── Assessment type → group label ────────────────────────────────────────────
# Maps the CSV's assessment-type column to the short group names used for
# chart colouring and the radar overlays.
ASSESSMENT_TO_GROUP = {
    "ADHD Initial Assessment": "ADHD Initial",
    "ADHD Follow-Up": "ADHD Follow-Up",
    "ASD Initial Assessment": "ASD Initial",
    "ASD - Helper": "Helper",
    "ADHD/General - Helper": "Helper",
    "All - Helper": "Helper",
    "QA": "QA",
    "General Adult Psychiatry": "General Adult",
}
# ── Colour palette (matches existing dashboard exactly) ──────────────────────
GROUP_COLORS = {
    "ADHD Initial": "#3b82f6",
    "ADHD Follow-Up": "#8b5cf6",
    "ASD Initial": "#10b981",
    "QA": "#f59e0b",
    "General Adult": "#ef4444",
    "Helper": "#94a3b8",
}
# Dark theme tokens used throughout the generated HTML/charts.
DARK_BG = "#0f1117"
CARD_BG = "#111827"
BORDER = "#1e2a3a"
TXT_MAIN = "#e2e8f0"
TXT_SUB = "#94a3b8"
TXT_DIM = "#475569"
# Red→green continuous scale for heatmaps (0.0 = worst, 1.0 = best).
COLORSCALE = [
    [0.00, "#ef4444"],
    [0.25, "#f97316"],
    [0.50, "#fbbf24"],
    [0.75, "#34d399"],
    [1.00, "#10b981"],
]
# Discrete palette indexed by rounded score (1-5 → index 0-4).
SCORE_PAL = ["#ef4444", "#f97316", "#fbbf24", "#34d399", "#10b981"]
# ── Synthetic baseline scores (from original hardcoded rubric_dashboard.py) ──
# Used when the judge cannot run or as the initial seed before a real judge run.
BASELINE_SCORES = {
    "reason_for_referral": dict(instruction_clarity=4, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=5, duplication_risk=2, structural_guidance=5, prompt_efficiency=4, notes="Strong example phrase list aids extraction. Missing explicit word count in prompt body. Low duplication risk as referral context is distinct."),
    "history_of_presenting_complaint": dict(instruction_clarity=4, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=3, notes="Two-task structure discards Task 1 output — adds tokens without value. High duplication risk with Medical/Psychiatric History. Typo: 'transcriptd'."),
    "psychiatric_history": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=3, notes="Very thorough extraction criteria. Two-task structure discards Task 1. Overlap risk with Medical History and Drug & Alcohol History addressed downstream."),
    "medical_history": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=4, notes="Clear scope. Boundary with Psychiatric History is a known cross-contamination risk addressed downstream by the deduplication prompt."),
    "family_medical_psychiatric_history": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=3, structural_guidance=5, prompt_efficiency=4, notes="SCAN reference is a strong clinical anchor. Clear exclusion of patient's own history. Duplication risk mainly from family conditions bleeding into patient Medical History."),
    "drug_alcohol_forensic": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=3, pii_handling=5, clinical_safety=5, duplication_risk=3, structural_guidance=4, prompt_efficiency=3, notes="Four high-stakes sections in one prompt — failure in one affects all. Word count guidance is aggregate only. Typo: 'types od substances'."),
    "adhd_diagnostic_formulation": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=5, clinical_safety=5, duplication_risk=2, structural_guidance=5, prompt_efficiency=4, notes="Strongest prompt in the set. Mandatory opening statement is excellent. Binary outcome structure is well-defined. DSM-5 severity anchoring is precise."),
    "adhd_follow_up": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=5, pii_handling=5, clinical_safety=3, duplication_risk=2, structural_guidance=5, prompt_efficiency=3, notes="Highly prescriptive paragraph-by-paragraph structure. Clinical safety weaker — no explicit safeguarding instruction. Female-only conditional is a code-level dependency."),
    "adhd_ia_deduplication": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=3, clinical_safety=4, duplication_risk=1, structural_guidance=5, prompt_efficiency=4, notes="Excellent JSON schema with confidence scores. Cross-heading risk pairs are well-specified. PII less critical as it operates on already-generated output."),
    "asd_deduplication": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=3, clinical_safety=4, duplication_risk=1, structural_guidance=5, prompt_efficiency=4, notes="Near-identical to ADHD deduplication with ASD-specific heading pairs. Both could be unified into a single parameterised prompt."),
    "asd_dev_history_social_interaction": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=3, notes="Rich reporting verb list is a nice touch. Combined heading reduces API calls but raises duplication risk. Typo: 'thourough'."),
    "asd_social_communication": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=4, structural_guidance=4, prompt_efficiency=3, notes="Overlap with Developmental History is a significant risk. Expand-most-detailed-example instruction could conflict with word count targets."),
    "asd_routines": dict(instruction_clarity=5, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=3, structural_guidance=5, prompt_efficiency=4, notes="Good coverage of all sensory domains. Target word count (~500) could be more explicitly embedded in the prompt body."),
    "asd_diagnostic_formulation": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=5, clinical_safety=5, duplication_risk=2, structural_guidance=5, prompt_efficiency=4, notes="Uses ADHD prefix (not ASD prefix) — potential bug. Otherwise mirrors the quality of the ADHD formulation prompt. DSM-5 Level 1/2/3 anchoring is well-specified."),
    "asd_dev_social_combined": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=4, duplication_risk=5, structural_guidance=4, prompt_efficiency=2, notes="Highest duplication risk in the set. Combines two sections known to overlap. 1000-2000 word output makes accuracy hard to verify across both sections simultaneously."),
    "general_adult_dictation": dict(instruction_clarity=4, constraint_completeness=4, output_specificity=3, pii_handling=4, clinical_safety=4, duplication_risk=2, structural_guidance=5, prompt_efficiency=3, notes="Most flexible prompt — 21 headings with blanks allowed. Output specificity necessarily lower given open-ended dictation context. Manual Instructions heading is a thoughtful edge case handler."),
    "general_adult_formulation": dict(instruction_clarity=4, constraint_completeness=5, output_specificity=4, pii_handling=5, clinical_safety=5, duplication_risk=2, structural_guidance=4, prompt_efficiency=4, notes="Critical safety instruction (no diagnosis not explicitly stated) is well emphasised. Typo in guideline 4: 'DO NOT about the word transcript'."),
    "adhd_prompt_prefix": dict(instruction_clarity=5, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=3, duplication_risk=1, structural_guidance=3, prompt_efficiency=5, notes="Efficient and well-structured. Clinical safety intentionally minimal — constraints live in child prompts. Could add a safeguarding reminder as a safety net."),
    "asd_prompt_prefix": dict(instruction_clarity=5, constraint_completeness=4, output_specificity=4, pii_handling=5, clinical_safety=3, duplication_risk=1, structural_guidance=3, prompt_efficiency=5, notes="Well-adapted from ADHD prefix for multi-source input. Uses American 'anonymized' instead of British 'anonymised' — inconsistency with the style guide."),
    "remove_leading_tabs": dict(instruction_clarity=5, constraint_completeness=5, output_specificity=5, pii_handling=5, clinical_safety=5, duplication_risk=1, structural_guidance=5, prompt_efficiency=5, notes="Utility function — not an LLM prompt. Scores reflect it performs a single, well-defined mechanical task correctly."),
}
# ═════════════════════════════════════════════════════════════════════════════
# JUDGE SYSTEM PROMPTS
# ═════════════════════════════════════════════════════════════════════════════
PROMPT_QUALITY_SYSTEM = """
You are an expert clinical AI prompt evaluator. Assess the quality of the given
psychiatric report-generation prompt across 8 dimensions.
Respond ONLY with a single valid JSON object. No preamble, no explanation, no
markdown fences. Any non-JSON output will be rejected and retried.
Required JSON schema:
{
"instruction_clarity": <integer 1-5>,
"constraint_completeness": <integer 1-5>,
"output_specificity": <integer 1-5>,
"pii_handling": <integer 1-5>,
"clinical_safety": <integer 1-5>,
"duplication_risk": <integer 1-5>,
"structural_guidance": <integer 1-5>,
"prompt_efficiency": <integer 1-5>,
"notes": "<one concise sentence — the single most important finding>"
}
Scoring guide (1 = very poor, 5 = excellent):
instruction_clarity — 5: every edge case handled explicitly; 1: vague or contradictory
constraint_completeness — 5: all safety/format guardrails present; 1: critical constraints missing
output_specificity — 5: format, length, headings fully defined; 1: no output format specified
pii_handling — 5: PII token protocol stated with examples; 1: no PII instructions
clinical_safety — 5: no unsolicited diagnosis possible; 1: model could infer diagnoses
duplication_risk — 5: LOW risk (good, distinct scope); 1: HIGH risk (content bleeds across sections)
structural_guidance — 5: headings, paragraphs, ordering fully prescribed; 1: no structure specified
prompt_efficiency — 5: minimal tokens, maximum clarity; 1: verbose and redundant
""".strip()
OUTPUT_QUALITY_SYSTEM = """
You are an expert clinical documentation reviewer. Given a psychiatric prompt and
the AI-generated output it produced, assess output quality across 8 dimensions.
Respond ONLY with a single valid JSON object. No preamble, no explanation, no
markdown fences. Any non-JSON output will be rejected and retried.
Required JSON schema:
{
"transcript_fidelity": <integer 1-5>,
"no_hallucination": <integer 1-5>,
"clinical_tone": <integer 1-5>,
"section_completeness": <integer 1-5>,
"pii_token_preserved": <integer 1-5>,
"no_unsolicited_diagnosis": <integer 1-5>,
"british_english": <integer 1-5>,
"word_count_adherence": <integer 1-5>,
"output_notes": "<one concise sentence — the single most important finding>"
}
Scoring guide (1 = very poor, 5 = excellent):
transcript_fidelity — 5: every claim verifiable from transcript; 1: significant content not traceable
no_hallucination — 5: zero invented facts; 1: clear hallucinations present
clinical_tone — 5: formal third-person psychiatric register; 1: colloquial or inappropriate
section_completeness — 5: all required headings populated; 1: key sections absent
pii_token_preserved — 5: all tokens (e.g. {{LOCATION-1}}) intact; 1: tokens modified or expanded
no_unsolicited_diagnosis — 5: no diagnosis not in transcript; 1: diagnosis introduced by model
british_english — 5: consistent British spelling throughout; 1: American English used systematically
word_count_adherence — 5: within 10% of target; 1: >50% over or under (score 3 if no target stated)
""".strip()
# ═════════════════════════════════════════════════════════════════════════════
# LOGGING
# ═════════════════════════════════════════════════════════════════════════════
# Root logging configuration: concise timestamped console lines, INFO level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger used by every phase of the pipeline.
log = logging.getLogger("autodeclare")
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 1: OUTPUT CAPTURE HELPER ───────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def save_output_for_judge(
    prompt_name: str,
    output_text: str,
    run_id: str,
    staging_dir: str = DEFAULT_STAGING,
) -> pathlib.Path:
    """Persist one generated section to the judge staging area.

    Call this immediately after each Claude API response in your clinical
    pipeline. One JSON file is written per section per run; Judge B later
    discovers it by run id.

    Example:
        section_text = call_claude_api(prompt)
        save_output_for_judge("reason_for_referral", section_text, run_id="2026-02-27_001")
    """
    out_dir = pathlib.Path(staging_dir) / "outputs"
    out_dir.mkdir(parents=True, exist_ok=True)
    # Restrict the name-derived filename component to [A-Za-z0-9_].
    safe_name = "".join(c if (c.isalnum() or c == "_") else "_" for c in prompt_name)
    payload = {
        "prompt_name": prompt_name,
        "run_id": run_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "output": output_text,
    }
    target = out_dir / f"{run_id}_{safe_name}.json"
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    return target
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 2: LOAD PROMPT INVENTORY ───────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def load_inventory(csv_path: pathlib.Path) -> dict:
    """Read prompt_inventory.csv and return {prompt name: full CSV row dict}.

    Exits the process when the file is missing; rows with a blank
    "Prompt Name" column are silently dropped.
    """
    if not csv_path.exists():
        log.error(f"CSV not found: {csv_path}")
        sys.exit(1)
    with open(csv_path, newline="", encoding="utf-8") as fh:
        rows = list(csv.DictReader(fh))
    prompts = {}
    for row in rows:
        key = row.get("Prompt Name", "").strip()
        if key:
            prompts[key] = row
    log.info(f"Loaded {len(prompts)} prompts from {csv_path.name}")
    return prompts
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 3: OLLAMA PREFLIGHT CHECK ──────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def check_ollama(client, model: str) -> bool:
    """
    Verify Ollama is reachable and the requested model is available.

    Returns True when both checks pass. On failure, prints actionable
    remediation steps — including the underlying exception, which the
    previous version caught but silently discarded — and returns False.
    """
    try:
        models = client.models.list()
        available = [m.id for m in models.data]
    except Exception as exc:
        print("\n" + "─" * 60)
        print(" ERROR: Cannot reach Ollama at", client.base_url)
        # Surface the root cause so the operator isn't debugging blind.
        print(f"   Details: {exc}")
        print(" Start it with: OLLAMA_HOST=0.0.0.0:11434 ollama serve &")
        print("─" * 60 + "\n")
        return False
    if model not in available:
        print("\n" + "─" * 60)
        print(f" ERROR: Model '{model}' not found in Ollama.")
        print(f" Pull it with: ollama pull {model}")
        print(f" Available: {', '.join(available) or '(none)'}")
        print("─" * 60 + "\n")
        return False
    log.info(f"Ollama OK — model '{model}' available")
    return True
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 4: JUDGE CALLS ─────────────────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def _call_judge(
    client,
    model: str,
    system: str,
    user_content: str,
    required_keys: list,
    label: str,
) -> Optional[dict]:
    """Core judge call with retry, JSON validation, and score coercion.

    Sends one chat completion per attempt (temperature 0, JSON-object
    response format), parses the reply, verifies every required key is
    present, and clamps each non-notes score to an int in [1, 5].
    Returns the parsed dict, or None after RETRY_LIMIT failed attempts.
    Retries use exponential backoff: RETRY_DELAY * 2**(attempt-1).
    """
    for attempt in range(1, RETRY_LIMIT + 1):
        try:
            resp = client.chat.completions.create(
                model=model,
                temperature=0.0,
                max_tokens=512,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user_content},
                ],
                response_format={"type": "json_object"},
            )
            raw = resp.choices[0].message.content.strip()
            # Strip accidental markdown fences. FIX: the old
            # parts[1].lstrip("json") removed any leading j/s/o/n *characters*
            # (str.lstrip takes a character set, not a prefix) and missed
            # uppercase fences such as ```JSON. Remove the literal language
            # tag explicitly and case-insensitively instead.
            if raw.startswith("```"):
                parts = raw.split("```")
                if len(parts) > 1:
                    raw = parts[1].strip()
                    if raw[:4].lower() == "json":
                        raw = raw[4:].lstrip()
            parsed = json.loads(raw)
            # Validate all required keys are present.
            missing = [k for k in required_keys if k not in parsed]
            if missing:
                raise ValueError(f"Missing keys: {missing}")
            # Coerce dimension scores to int (some models return floats or
            # strings) and clamp to the rubric range 1..5.
            notes_keys = {"notes", "output_notes"}
            for k in required_keys:
                if k not in notes_keys:
                    parsed[k] = max(1, min(5, int(round(float(parsed[k])))))
            return parsed
        except Exception as exc:
            if attempt < RETRY_LIMIT:
                delay = RETRY_DELAY * (2 ** (attempt - 1))
                log.warning(f"[{label}] attempt {attempt} failed ({exc}). Retry in {delay:.0f}s")
                time.sleep(delay)
            else:
                log.error(f"[{label}] failed after {RETRY_LIMIT} attempts: {exc}")
                return None
def judge_prompt_quality(client, model, name, prompt_text):
    """Judge A — score the prompt's design in isolation (8 dims + notes)."""
    delimited = (
        f"Prompt name: {name}\n\n"
        f"--- PROMPT TEXT START ---\n{prompt_text}\n--- PROMPT TEXT END ---"
    )
    return _call_judge(
        client,
        model,
        system=PROMPT_QUALITY_SYSTEM,
        user_content=delimited,
        required_keys=PROMPT_DIM_IDS + ["notes"],
        label=f"{name} / JudgeA",
    )
def judge_output_quality(client, model, name, prompt_text, output_text):
    """Judge B — score the generated output against the prompt's intent."""
    delimited = (
        f"Prompt name: {name}\n\n"
        f"--- PROMPT TEXT START ---\n{prompt_text}\n--- PROMPT TEXT END ---\n\n"
        f"--- GENERATED OUTPUT START ---\n{output_text}\n--- GENERATED OUTPUT END ---"
    )
    return _call_judge(
        client,
        model,
        system=OUTPUT_QUALITY_SYSTEM,
        user_content=delimited,
        required_keys=OUTPUT_DIM_IDS + ["output_notes"],
        label=f"{name} / JudgeB",
    )
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 5: SCORING ORCHESTRATION ───────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
def run_scoring(
    run_id: str,
    csv_path: pathlib.Path,
    staging_dir: pathlib.Path,
    model: str,
    base_url: str,
    prompts_only: bool,
    use_judge: bool,
) -> list:
    """
    Orchestrate all judge calls and return the complete scored list.

    For every inventory prompt: Judge A scores the prompt text (falling back
    to BASELINE_SCORES for utility entries with no prompt body, or whenever
    the judge is disabled/unreachable/fails); Judge B scores the captured
    output when a matching <run_id>_*.json file exists and ``prompts_only``
    is False. The full list is also persisted to
    <staging>/scores/<run_id>_scores.json for --dashboard-only rebuilds.
    """
    prompts = load_inventory(csv_path)
    # Scan for captured output files (written by save_output_for_judge),
    # keyed by the prompt_name recorded inside each file.
    outputs_dir = staging_dir / "outputs"
    output_files = {}
    if not prompts_only and outputs_dir.exists():
        for path in sorted(outputs_dir.glob(f"{run_id}_*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                pname = data.get("prompt_name", "")
                if pname:
                    output_files[pname] = data
            except Exception as exc:
                # A corrupt capture file must not abort the whole run.
                log.warning(f"Cannot read {path.name}: {exc}")
    log.info(f"Found {len(output_files)} output file(s) for run '{run_id}'")
    # Initialise Ollama client; any failure downgrades to baseline scoring.
    client = None
    if use_judge:
        if not OPENAI_AVAILABLE:
            log.warning("openai package not installed — falling back to baseline scores")
            use_judge = False
        else:
            client = OpenAI(base_url=base_url, api_key=DEFAULT_API_KEY)
            if not check_ollama(client, model):
                log.warning("Ollama check failed — falling back to baseline scores")
                use_judge = False
    all_scores = []
    total = len(prompts)
    for idx, (pname, prow) in enumerate(prompts.items(), start=1):
        prompt_text = prow.get("Full Prompt Content", "").strip()
        assessment_type = prow.get("Assessment Type", "")
        group = ASSESSMENT_TO_GROUP.get(assessment_type.strip(), assessment_type.strip())
        # NOTE(review): prow always contains "Prompt Name" (load_inventory
        # keys on it), so the title-cased fallback below looks like dead
        # code — confirm whether a prettified label column was intended.
        label = prow.get("Prompt Name", pname.replace("_", " ").title())
        log.info(f"[{idx}/{total}] {pname}")
        # ── Prompt quality ────────────────────────────────────────────────────
        if not prompt_text:
            # Utility entries (remove_leading_tabs) — use baseline if available
            pq = BASELINE_SCORES.get(pname)
            log.info(f" Utility entry — using baseline")
        elif use_judge:
            log.info(f" → Judge A (prompt quality)...")
            pq = judge_prompt_quality(client, model, pname, prompt_text)
            if pq is None:
                # Judge A may return None after exhausting retries.
                log.warning(f" Judge A failed — falling back to baseline")
                pq = BASELINE_SCORES.get(pname)
            time.sleep(INTER_CALL_DELAY)
        else:
            pq = BASELINE_SCORES.get(pname)
            log.info(f" Using baseline prompt scores")
        # ── Output quality ────────────────────────────────────────────────────
        oq = None
        has_output = pname in output_files
        if prompts_only:
            pass  # skip Judge B entirely
        elif has_output and use_judge and prompt_text:
            output_text = output_files[pname].get("output", "").strip()
            if output_text:
                log.info(f" → Judge B (output quality)...")
                oq = judge_output_quality(client, model, pname, prompt_text, output_text)
                if oq is None:
                    # No baseline exists for output quality; leave as None.
                    log.warning(f" Judge B failed — no output quality score recorded")
                time.sleep(INTER_CALL_DELAY)
            else:
                log.warning(f" Output file has empty 'output' field — skipping Judge B")
        elif has_output and not use_judge:
            log.info(f" Output file exists but judge disabled — skipping Judge B")
        all_scores.append({
            "name": pname,
            "label": label,
            "group": group,
            "run_id": run_id,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "has_output": has_output and not prompts_only,
            "is_utility": not bool(prompt_text),
            # "judge" only when the live judge actually produced this entry's
            # prompt score; utility/fallback entries report "baseline".
            "score_source": "judge" if (use_judge and pq is not None and prompt_text) else "baseline",
            "prompt_quality": pq,
            "output_quality": oq,
        })
    # Persist scores.json so --dashboard-only can rebuild without re-judging.
    scores_dir = staging_dir / "scores"
    scores_dir.mkdir(parents=True, exist_ok=True)
    scores_path = scores_dir / f"{run_id}_scores.json"
    scores_path.write_text(
        json.dumps(all_scores, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    judged = sum(1 for s in all_scores if s["score_source"] == "judge")
    baseline = sum(1 for s in all_scores if s["score_source"] == "baseline")
    with_oq = sum(1 for s in all_scores if s["output_quality"] is not None)
    log.info(f"Scores: {judged} judge | {baseline} baseline | {with_oq} with output quality")
    log.info(f"Saved → {scores_path}")
    return all_scores
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 6: DASHBOARD GENERATION ────────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
# ── Colour helpers ────────────────────────────────────────────────────────────
def score_color(val: float) -> str:
    """Map a 1-5 score onto the five-step palette, clamping out-of-range values."""
    bucket = round(val) - 1
    return SCORE_PAL[min(4, max(0, bucket))]
def hex_to_rgb(h: str) -> tuple:
    """Convert '#rrggbb' (leading '#' optional) into an (r, g, b) int tuple."""
    digits = h.lstrip("#")
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
def adjusted_avg(row: dict, dim_ids: list) -> float:
    """Mean score across dims, inverting duplication_risk (6 - score) so
    that higher is uniformly better; rounded to 2 decimal places."""
    total = 0
    for dim in dim_ids:
        total += (6 - row[dim]) if dim == "duplication_risk" else row[dim]
    return round(total / len(dim_ids), 2)
def fig_div(fig) -> str:
    """Render a Plotly figure as an embeddable fragment (JS bundled elsewhere)."""
    return fig.to_html(include_plotlyjs=False, full_html=False)
# ── Chart builders ────────────────────────────────────────────────────────────
def build_heatmap(df: pd.DataFrame, dim_ids: list, dim_labels: list, title: str):
    """Prompts × dimensions heatmap (scores 1-5) on the dark card theme.

    Rows are prompt labels, columns the human-readable dimension labels;
    every cell is annotated with its integer score.
    """
    z = df[dim_ids].values.tolist()
    labels = df["label"].tolist()
    fig = go.Figure(go.Heatmap(
        z=z, x=dim_labels, y=labels,
        # Print each score inside its cell.
        text=[[str(v) for v in row] for row in z],
        texttemplate="%{text}",
        textfont=dict(size=11, color="white", family="monospace"),
        colorscale=COLORSCALE, zmin=1, zmax=5,
        colorbar=dict(
            title=dict(text="Score", font=dict(color=TXT_SUB)),
            tickfont=dict(color=TXT_SUB, family="monospace"),
            bgcolor=CARD_BG, bordercolor=BORDER, thickness=14,
        ),
        hovertemplate="<b>%{y}</b><br>%{x}: %{z}/5<extra></extra>",
    ))
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        plot_bgcolor=CARD_BG, paper_bgcolor=CARD_BG,
        font=dict(color=TXT_SUB, family="monospace", size=10),
        xaxis=dict(tickangle=-35, showgrid=False, tickfont=dict(size=10, color=TXT_SUB)),
        # Reversed so the first DataFrame row renders at the top of the chart.
        yaxis=dict(showgrid=False, tickfont=dict(size=10, color=TXT_MAIN), autorange="reversed"),
        margin=dict(l=310, r=80, t=60, b=140), height=680,
    )
    return fig
def build_bar(df: pd.DataFrame, title: str):
    """Horizontal bar chart of adjusted averages, coloured by prompt group.

    The zero-point scatter traces added below exist only to give each group
    a legend swatch — a single Bar trace cannot carry per-group legend items.
    """
    df_s = df.sort_values("adj_avg")
    fig = go.Figure(go.Bar(
        x=df_s["adj_avg"], y=df_s["label"], orientation="h",
        marker=dict(
            color=[GROUP_COLORS.get(g, "#6b7280") for g in df_s["group"]],
            line=dict(width=0),
        ),
        text=df_s["adj_avg"].astype(str), textposition="outside",
        textfont=dict(color=TXT_SUB, size=10, family="monospace"),
        hovertemplate="<b>%{y}</b><br>Adj. avg: %{x}<extra></extra>",
    ))
    # Invisible markers: one legend entry per group colour.
    for grp, col in GROUP_COLORS.items():
        fig.add_trace(go.Scatter(x=[None], y=[None], mode="markers",
                                 marker=dict(size=10, color=col), name=grp))
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        plot_bgcolor=CARD_BG, paper_bgcolor=CARD_BG,
        font=dict(color=TXT_SUB, family="monospace", size=10),
        # Range padded past 5 so the outside text labels are not clipped.
        xaxis=dict(range=[0, 5.8], showgrid=True, gridcolor=BORDER,
                   tickfont=dict(size=10, color=TXT_SUB)),
        yaxis=dict(showgrid=False, tickfont=dict(size=10, color=TXT_MAIN)),
        legend=dict(bgcolor=CARD_BG, bordercolor=BORDER, borderwidth=1,
                    font=dict(color=TXT_SUB, size=10)),
        margin=dict(l=310, r=60, t=60, b=40), height=620,
    )
    return fig
def build_dim_averages(df: pd.DataFrame, dimensions: list, title: str):
    """Average score per rubric dimension across all prompts, worst first.

    duplication_risk is inverted (6 - score) so "higher is better" holds for
    every bar; its label is suffixed with "(inv.)" to flag the inversion.
    ``dimensions`` is a list of (dim_id, human_label) pairs.
    """
    dim_avgs = []
    for dim, lbl in dimensions:
        avg_val = round((6 - df[dim]).mean() if dim == "duplication_risk" else df[dim].mean(), 2)
        dim_avgs.append((lbl + (" (inv.)" if dim == "duplication_risk" else ""), avg_val))
    # Ascending: weakest dimension appears at the bottom of the h-bar chart.
    dim_avgs.sort(key=lambda x: x[1])
    fig = go.Figure(go.Bar(
        x=[v for _, v in dim_avgs], y=[n for n, _ in dim_avgs], orientation="h",
        marker=dict(color=[score_color(v) for _, v in dim_avgs], line=dict(width=0)),
        text=[str(v) for _, v in dim_avgs], textposition="outside",
        textfont=dict(color=TXT_SUB, size=11, family="monospace"),
        hovertemplate="<b>%{y}</b><br>Average: %{x}<extra></extra>",
    ))
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        plot_bgcolor=CARD_BG, paper_bgcolor=CARD_BG,
        font=dict(color=TXT_SUB, family="monospace", size=11),
        xaxis=dict(range=[0, 5.6], showgrid=True, gridcolor=BORDER,
                   tickfont=dict(size=10, color=TXT_SUB)),
        yaxis=dict(showgrid=False, tickfont=dict(size=11, color=TXT_MAIN)),
        margin=dict(l=240, r=60, t=60, b=40), height=400,
    )
    return fig
def build_radar_overlay(df: pd.DataFrame, dim_ids: list, dim_labels: list, title: str):
    """One radar polygon per prompt group, overlaid on shared polar axes.

    Each group's trace plots the group-mean score per dimension; groups with
    no rows in ``df`` are skipped.
    """
    # Repeat the first axis label so each polygon closes on itself.
    theta_closed = dim_labels + [dim_labels[0]]
    fig = go.Figure()
    for grp, col in GROUP_COLORS.items():
        gdf = df[df["group"] == grp]
        if gdf.empty:
            continue
        scores = [round(gdf[d].mean(), 2) for d in dim_ids]
        r_c = scores + [scores[0]]  # close the polygon (matches theta_closed)
        # Translucent fill derived from the group's hex colour.
        r, g, b = hex_to_rgb(col)
        fig.add_trace(go.Scatterpolar(
            r=r_c, theta=theta_closed, fill="toself",
            fillcolor=f"rgba({r},{g},{b},0.12)", line=dict(color=col, width=2),
            name=grp,
            hovertemplate="%{theta}: %{r:.2f}<extra>" + grp + "</extra>",
        ))
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        polar=dict(
            bgcolor="#0d1117",
            radialaxis=dict(visible=True, range=[0, 5], showticklabels=False,
                            gridcolor=BORDER, linecolor=BORDER),
            angularaxis=dict(tickfont=dict(color=TXT_SUB, size=10, family="monospace"),
                             gridcolor=BORDER, linecolor=BORDER),
        ),
        paper_bgcolor=CARD_BG,
        legend=dict(bgcolor=CARD_BG, bordercolor=BORDER, borderwidth=1,
                    font=dict(color=TXT_SUB, size=11)),
        margin=dict(l=60, r=60, t=60, b=60), height=500,
    )
    return fig
def build_radar_grid(prompts_list: list, dim_ids: list, dim_labels: list, title: str):
    """Small-multiples grid: one mini radar per prompt, 4 per row.

    ``prompts_list`` is the raw list of score dicts (not a DataFrame); each
    dict must carry the dim_id keys plus "label" and "group".
    """
    theta_closed = dim_labels + [dim_labels[0]]
    COLS = 4
    N = len(prompts_list)
    ROWS = -(-N // COLS)  # ceiling division
    specs = [[{"type": "polar"}] * COLS for _ in range(ROWS)]
    # Pad titles so make_subplots gets exactly ROWS*COLS entries.
    titles = [p["label"] for p in prompts_list] + [""] * (ROWS * COLS - N)
    fig = make_subplots(rows=ROWS, cols=COLS, specs=specs,
                        subplot_titles=titles,
                        vertical_spacing=0.08, horizontal_spacing=0.04)
    for i, pd_ in enumerate(prompts_list):
        ri = i // COLS + 1
        ci = i % COLS + 1
        scores = [pd_[d] for d in dim_ids]
        r_c = scores + [scores[0]]  # close the polygon
        col = GROUP_COLORS.get(pd_["group"], "#6b7280")
        r, g, b = hex_to_rgb(col)
        fig.add_trace(go.Scatterpolar(
            r=r_c, theta=theta_closed, fill="toself",
            fillcolor=f"rgba({r},{g},{b},0.18)", line=dict(color=col, width=1.5),
            name=pd_["label"], showlegend=False,
            hovertemplate="%{theta}: %{r}<extra>" + pd_["label"] + "</extra>",
        ), row=ri, col=ci)
    # Plotly names polar subplots "polar", "polar2", "polar3", ... — style
    # every cell (including empty padding cells) with the dark theme.
    for i in range(1, ROWS * COLS + 1):
        key = f"polar{i}" if i > 1 else "polar"
        fig.update_layout(**{key: dict(
            bgcolor="#0d1117",
            radialaxis=dict(visible=True, range=[0, 5], showticklabels=False,
                            gridcolor=BORDER, linecolor=BORDER),
            angularaxis=dict(tickfont=dict(color=TXT_DIM, size=7),
                             gridcolor=BORDER, linecolor=BORDER),
        )})
    # Subplot titles arrive as annotations; restyle them to match the theme.
    for ann in fig.layout.annotations:
        ann.font = dict(color=TXT_SUB, size=9, family="monospace")
    fig.update_layout(
        title=dict(text=title, font=dict(color=TXT_MAIN, size=15, family="Georgia"), x=0.01),
        paper_bgcolor=CARD_BG, height=ROWS * 200 + 80,
        margin=dict(l=20, r=20, t=60, b=20),
    )
    return fig
def build_notes_table(df: pd.DataFrame, notes_col: str, html_border: str) -> str:
    """Render the reviewer-notes table rows as HTML, worst adjusted avg first."""
    chunks = []
    for _, entry in df.sort_values("adj_avg").iterrows():
        group_col = GROUP_COLORS.get(entry["group"], "#6b7280")
        avg = entry["adj_avg"]
        avg_col = score_color(avg)
        note = entry.get(notes_col, "") or ""
        chunks.append(f"""
<tr>
<td style="padding:10px 12px;border-bottom:1px solid {html_border};">
<span style="font-size:10px;color:{group_col};font-family:monospace;
text-transform:uppercase;display:block;margin-bottom:3px;">{entry['group']}</span>
<span style="font-size:12px;color:{TXT_MAIN};">{entry['label']}</span>
</td>
<td style="padding:10px 12px;border-bottom:1px solid {html_border};font-size:18px;
font-weight:bold;font-family:monospace;color:{avg_col};text-align:center;">{avg}</td>
<td style="padding:10px 12px;border-bottom:1px solid {html_border};font-size:12px;
color:{TXT_SUB};line-height:1.6;font-family:'Helvetica Neue',sans-serif;">{note}</td>
</tr>""")
    return "".join(chunks)
def build_score_source_badge(source: str) -> str:
    """Small pill badge marking a row as judge-scored or baseline-seeded."""
    if source == "judge":
        bg, fg, text = "#064e3b", "#6ee7b7", "judge"
    else:
        bg, fg, text = "#1e3a5f", "#93c5fd", "baseline"
    return (
        f'<span style="background:{bg};color:{fg};font-size:10px;'
        f'font-family:monospace;padding:2px 8px;border-radius:4px;">{text}</span>'
    )
def generate_dashboard(all_scores: list, output_path: pathlib.Path, run_id: str) -> None:
    """Build the complete self-contained HTML dashboard from scored data.

    Assembles prompt-quality charts (always) and output-quality charts (only
    when at least one entry carries output_quality scores), inlines the
    bundled plotly.min.js so the page needs no CDN, and writes the result
    to ``output_path``. Entries whose prompt_quality is missing any rubric
    dimension are skipped; if none survive, nothing is written.
    """
    # ── Prompt quality DataFrame (all entries with scores) ───────────────────
    pq_rows = []
    for s in all_scores:
        pq = s.get("prompt_quality")
        if pq and all(k in pq for k in PROMPT_DIM_IDS):
            row = {
                "name": s["name"],
                "label": s["label"],
                "group": s["group"],
                "source": s.get("score_source", "baseline"),
                "notes": pq.get("notes", ""),
            }
            for dim in PROMPT_DIM_IDS:
                row[dim] = pq[dim]
            row["adj_avg"] = adjusted_avg(row, PROMPT_DIM_IDS)
            pq_rows.append(row)
    # ── Output quality DataFrame (only entries with oq scores) ───────────────
    oq_rows = []
    for s in all_scores:
        oq = s.get("output_quality")
        if oq and all(k in oq for k in OUTPUT_DIM_IDS):
            row = {
                "name": s["name"],
                "label": s["label"],
                "group": s["group"],
                "notes": oq.get("output_notes", ""),
            }
            for dim in OUTPUT_DIM_IDS:
                row[dim] = oq[dim]
            row["adj_avg"] = adjusted_avg(row, OUTPUT_DIM_IDS)
            oq_rows.append(row)
    if not pq_rows:
        log.error("No scoreable prompts found — cannot generate dashboard")
        return
    df_pq = pd.DataFrame(pq_rows)
    has_oq = bool(oq_rows)
    df_oq = pd.DataFrame(oq_rows) if has_oq else None
    n_prompts = len(df_pq)
    n_judged = sum(1 for s in all_scores if s.get("score_source") == "judge")
    # ── Build all charts ──────────────────────────────────────────────────────
    log.info("Generating charts...")
    # Prompt quality charts (original set)
    fig_pq_heatmap = build_heatmap(df_pq, PROMPT_DIM_IDS, PROMPT_DIM_LABELS,
                                   "Prompt Quality — Score Heatmap")
    fig_pq_bar = build_bar(df_pq, "Prompt Quality — Adjusted Average by Prompt")
    fig_pq_dims = build_dim_averages(df_pq, PROMPT_DIMS,
                                     f"Prompt Quality — Dimension Averages ({n_prompts} Prompts)")
    fig_pq_radar = build_radar_overlay(df_pq, PROMPT_DIM_IDS, PROMPT_DIM_LABELS,
                                       "Prompt Quality — Group Radar Overlay")
    fig_pq_grid = build_radar_grid(pq_rows, PROMPT_DIM_IDS, PROMPT_DIM_LABELS,
                                   "Prompt Quality — Individual Radar Grid")
    pq_notes_html = build_notes_table(df_pq, "notes", BORDER)
    # Output quality charts (new section — only if data exists)
    oq_section = ""
    if has_oq:
        fig_oq_heatmap = build_heatmap(df_oq, OUTPUT_DIM_IDS, OUTPUT_DIM_LABELS,
                                       "Output Quality — Score Heatmap")
        fig_oq_bar = build_bar(df_oq, "Output Quality — Adjusted Average by Prompt")
        fig_oq_dims = build_dim_averages(df_oq, OUTPUT_DIMS,
                                         f"Output Quality — Dimension Averages ({len(oq_rows)} Prompts)")
        fig_oq_radar = build_radar_overlay(df_oq, OUTPUT_DIM_IDS, OUTPUT_DIM_LABELS,
                                           "Output Quality — Group Radar Overlay")
        fig_oq_grid = build_radar_grid(oq_rows, OUTPUT_DIM_IDS, OUTPUT_DIM_LABELS,
                                       "Output Quality — Individual Radar Grid")
        oq_notes_html = build_notes_table(df_oq, "notes", BORDER)
        oq_section = f"""
<div class="section-header">
<h2 class="section-title">Output Quality</h2>
<p class="section-sub">Assessed by Judge B against transcript fidelity, clinical tone, PII preservation, and more.
<span style="color:{TXT_DIM};font-size:11px;"> · {len(oq_rows)} outputs scored</span>
</p>
</div>
<div class="card"><h2 class="st">Output Quality — Score Heatmap</h2>{fig_div(fig_oq_heatmap)}</div>
<div class="two-col">
<div class="card"><h2 class="st">Adjusted Average by Prompt</h2>{fig_div(fig_oq_bar)}</div>
<div class="card"><h2 class="st">Dimension Averages</h2>{fig_div(fig_oq_dims)}</div>
</div>
<div class="two-col">
<div class="card"><h2 class="st">Group Radar Overlay</h2>{fig_div(fig_oq_radar)}</div>
<div class="card"><h2 class="st">Individual Radar Grid</h2>{fig_div(fig_oq_grid)}</div>
</div>
<div class="card">
<h2 class="st">Output Quality — Reviewer Notes</h2>
<table>
<thead><tr>
<th style="width:240px;">Prompt</th>
<th style="width:80px;text-align:center;">Adj. Avg</th>
<th>Notes</th>
</tr></thead>
<tbody>{oq_notes_html}</tbody>
</table>
</div>"""
    # ── Score source legend for prompt quality notes table ────────────────────
    source_col_html = ""
    for _, row in df_pq.sort_values("adj_avg").iterrows():
        badge = build_score_source_badge(row.get("source", "baseline"))
        source_col_html += f"""
<tr><td style="padding:6px 12px;border-bottom:1px solid {BORDER};
font-size:11px;color:{TXT_SUB};font-family:monospace;">{row['label']}</td>
<td style="padding:6px 12px;border-bottom:1px solid {BORDER};">{badge}</td></tr>"""
    # ── Load bundled Plotly JS ────────────────────────────────────────────────
    # Inlined so the dashboard works with no network/CDN access at view time.
    plotly_js_path = os.path.join(
        os.path.dirname(plotly.__file__), "package_data", "plotly.min.js"
    )
    with open(plotly_js_path, "r", encoding="utf-8") as f:
        plotly_js = f.read()
    # ── Assemble HTML ─────────────────────────────────────────────────────────
    html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AutoDeclare — Rubric Dashboard</title>
<script>{plotly_js}</script>
<style>
*,*::before,*::after{{box-sizing:border-box;margin:0;padding:0;}}
body {{background:{DARK_BG};color:{TXT_MAIN};font-family:Georgia,serif;}}
header {{background:#1a1f2e;border-bottom:1px solid {BORDER};padding:28px 40px;}}
header p.eyebrow {{color:{TXT_DIM};font-size:11px;letter-spacing:.2em;
font-family:monospace;text-transform:uppercase;margin-bottom:6px;}}
header h1 {{font-size:28px;font-weight:normal;color:{TXT_MAIN};margin-bottom:6px;}}
header p.sub {{color:{TXT_DIM};font-size:12px;font-family:'Helvetica Neue',sans-serif;}}
main {{max-width:1300px;margin:0 auto;padding:32px 40px;}}
.card {{background:{CARD_BG};border:1px solid {BORDER};border-radius:10px;
padding:20px;margin-bottom:24px;}}
h2.st {{font-size:13px;font-weight:normal;color:{TXT_SUB};font-family:monospace;
letter-spacing:.1em;text-transform:uppercase;margin-bottom:16px;}}
.two-col {{display:grid;grid-template-columns:1fr 1fr;gap:24px;margin-bottom:24px;}}
.section-header {{margin:40px 0 20px;padding-bottom:12px;
border-bottom:2px solid {BORDER};}}
.section-title {{font-size:11px;font-family:monospace;letter-spacing:.2em;
text-transform:uppercase;color:#64748b;margin-bottom:6px;}}
.section-sub {{font-size:13px;color:{TXT_DIM};font-family:'Helvetica Neue',sans-serif;}}
table {{width:100%;border-collapse:collapse;}}
th {{text-align:left;padding:8px 12px;color:{TXT_DIM};font-size:11px;
font-family:monospace;letter-spacing:.05em;border-bottom:2px solid {BORDER};}}
.meta-grid {{display:grid;grid-template-columns:repeat(4,1fr);gap:12px;margin-bottom:24px;}}
.meta-card {{background:{CARD_BG};border:1px solid {BORDER};border-radius:8px;
padding:14px 16px;}}
.meta-card .label {{font-size:10px;font-family:monospace;letter-spacing:.12em;
text-transform:uppercase;color:{TXT_DIM};margin-bottom:6px;}}
.meta-card .value {{font-size:20px;font-family:monospace;color:{TXT_MAIN};}}
@media(max-width:900px){{.two-col{{grid-template-columns:1fr;}}
.meta-grid{{grid-template-columns:repeat(2,1fr);}}main{{padding:16px;}}}}
</style>
</head>
<body>
<header>
<p class="eyebrow">AutoDeclare · AI-as-Judge Pipeline</p>
<h1>Rubric Dashboard</h1>
<p class="sub">Run: <code style="color:{TXT_MAIN};">{run_id}</code>
· {n_prompts} prompts
· {n_judged} judge-scored
· {len(oq_rows) if has_oq else 0} outputs assessed
· * Duplication Risk is inverted — lower score = lower risk = better
</p>
</header>
<main>
<!-- ── Meta summary cards ── -->
<div class="meta-grid">
<div class="meta-card">
<div class="label">Total Prompts</div>
<div class="value">{n_prompts}</div>
</div>
<div class="meta-card">
<div class="label">Judge-Scored</div>
<div class="value" style="color:#10b981;">{n_judged}</div>
</div>
<div class="meta-card">
<div class="label">Baseline-Seeded</div>
<div class="value" style="color:#3b82f6;">{n_prompts - n_judged}</div>
</div>
<div class="meta-card">
<div class="label">Outputs Assessed</div>
<div class="value" style="color:#f59e0b;">{len(oq_rows) if has_oq else 0}</div>
</div>
</div>
<!-- ══ PROMPT QUALITY SECTION ══ -->
<div class="section-header">
<h2 class="section-title">Prompt Quality</h2>
<p class="section-sub">Assessed by Judge A — prompt design evaluated in isolation across 8 rubric dimensions.</p>
</div>
<div class="card"><h2 class="st">Score Heatmap — All Prompts × All Dimensions</h2>{fig_div(fig_pq_heatmap)}</div>
<div class="two-col">
<div class="card"><h2 class="st">Adjusted Average by Prompt</h2>{fig_div(fig_pq_bar)}</div>
<div class="card"><h2 class="st">Dimension Averages — All Prompts</h2>{fig_div(fig_pq_dims)}</div>
</div>
<div class="two-col">
<div class="card"><h2 class="st">Group Radar Overlay</h2>{fig_div(fig_pq_radar)}</div>
<div class="card"><h2 class="st">Individual Prompt Radar Grid</h2>{fig_div(fig_pq_grid)}</div>
</div>
<div class="card">
<h2 class="st">Reviewer Notes</h2>
<table>
<thead><tr>
<th style="width:240px;">Prompt</th>
<th style="width:80px;text-align:center;">Adj. Avg</th>
<th>Notes</th>
</tr></thead>
<tbody>{pq_notes_html}</tbody>
</table>
</div>
<!-- ── Score source legend ── -->
<div class="card">
<h2 class="st">Score Sources</h2>
<p style="font-size:13px;color:{TXT_DIM};margin-bottom:12px;">
Indicates whether each prompt's score came from the live Ollama judge or the
hardcoded baseline. Run with <code>--run-id</code> and Ollama available to replace baseline entries.
</p>
<table style="max-width:500px;">
<thead><tr><th>Prompt</th><th>Source</th></tr></thead>
<tbody>{source_col_html}</tbody>
</table>
</div>
<!-- ══ OUTPUT QUALITY SECTION ══ -->
{oq_section if has_oq else f'''
<div class="section-header">
<h2 class="section-title">Output Quality</h2>
<p class="section-sub" style="color:{TXT_DIM};">
No output files found for run <code>{run_id}</code>.
Add <code>save_output_for_judge()</code> calls to your pipeline and re-run
with <code>--run-id {run_id}</code> to populate this section.
</p>
</div>'''}
</main>
</body>
</html>"""
    output_path.write_text(html, encoding="utf-8")
    size_kb = output_path.stat().st_size // 1024
    log.info(f"Dashboard written → {output_path} ({size_kb} KB)")
# ═════════════════════════════════════════════════════════════════════════════
# ── PHASE 7: SSH TUNNEL SERVER ────────────────────────────────────────────────
# ═════════════════════════════════════════════════════════════════════════════
class _QuietHandler(http.server.SimpleHTTPRequestHandler):
"""Minimal HTTP handler — suppresses request logs, serves from html directory."""
def __init__(self, *args, directory=None, **kwargs):
super().__init__(*args, directory=directory, **kwargs)
def log_message(self, fmt, *args):
pass # silence per-request logs
def serve_dashboard(output_path: pathlib.Path, port: int) -> None:
    """
    Bind an HTTP server to 127.0.0.1 only (never 0.0.0.0) and print the
    SSH tunnel command required to access it from a remote machine.

    Blocks until Ctrl+C (or stdin EOF on platforms without signal.pause),
    then shuts the server down.
    """
    directory = str(output_path.parent.resolve())
    filename = output_path.name
    # FIX: the printed URL previously embedded the literal text "(unknown)"
    # instead of the dashboard file name (the computed `filename` was never
    # used), so following the printed link returned 404.
    url = f"http://127.0.0.1:{port}/{filename}"
    def handler_factory(*args, **kwargs):
        # Pin the handler to the dashboard directory regardless of CWD.
        return _QuietHandler(*args, directory=directory, **kwargs)
    socketserver.TCPServer.allow_reuse_address = True
    try:
        httpd = socketserver.TCPServer(("127.0.0.1", port), handler_factory)
    except OSError as exc:
        log.error(f"Cannot bind to port {port}: {exc}")
        log.error(f"Try: python autodeclare.py --port {port + 1}")
        return
    thread = threading.Thread(target=httpd.serve_forever, daemon=True)
    thread.start()
    ec2_ip = "<your-ec2-public-ip>"
    print()
    print("─" * 64)
    print(" AutoDeclare Dashboard — Ready")
    print("─" * 64)
    print()
    print(" The server is bound to 127.0.0.1 only (SSH tunnel required).")
    print()
    print(" Run this command on your LOCAL machine:")
    print()
    print(f" ssh -L {port}:127.0.0.1:{port} ubuntu@{ec2_ip}")
    print()
    print(" Then open this URL in your local browser:")
    print()
    print(f" {url}")
    print()
    print(" Press Ctrl+C to stop the server.")
    print("─" * 64)
    print()
    try:
        # signal.pause() is POSIX-only; fall back to blocking on stdin.
        signal.pause()
    except (AttributeError, OSError):
        try:
            input()
        except (EOFError, KeyboardInterrupt):
            pass
    finally:
        httpd.shutdown()
        log.info("Server stopped.")
# ═════════════════════════════════════════════════════════════════════════════
# CLI
# ═════════════════════════════════════════════════════════════════════════════
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for the pipeline."""
    epilog = """
Examples:
python autodeclare.py --run-id 2026-02-27_001
python autodeclare.py --prompts-only
python autodeclare.py --dashboard-only --run-id 2026-02-27_001
python autodeclare.py --no-serve --run-id 2026-02-27_001
python autodeclare.py --run-id 2026-02-27_001 --model mistral-small
"""
    parser = argparse.ArgumentParser(
        description="AutoDeclare — AI-as-Judge pipeline + Rubric Dashboard",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    # Value options.
    parser.add_argument("--run-id", default="latest",
                        help="Run ID for output file matching (default: 'latest')")
    parser.add_argument("--csv", default=DEFAULT_CSV,
                        help=f"Path to prompt_inventory.csv (default: {DEFAULT_CSV})")
    parser.add_argument("--staging", default=DEFAULT_STAGING,
                        help=f"Staging directory root (default: {DEFAULT_STAGING})")
    parser.add_argument("--model", default=DEFAULT_MODEL,
                        help=f"Ollama model name (default: {DEFAULT_MODEL})")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL,
                        help=f"Ollama base URL (default: {DEFAULT_BASE_URL})")
    parser.add_argument("--port", type=int, default=DEFAULT_PORT,
                        help=f"Dashboard server port (default: {DEFAULT_PORT})")
    # Mode flags.
    parser.add_argument("--prompts-only", action="store_true",
                        help="Score prompts only — skip Judge B")
    parser.add_argument("--dashboard-only", action="store_true",
                        help="Skip judging — rebuild dashboard from existing scores.json")
    parser.add_argument("--no-serve", action="store_true",
                        help="Write HTML and exit — do not start the server")
    parser.add_argument("--no-judge", action="store_true",
                        help="Use baseline scores only — do not call Ollama")
    return parser.parse_args()
def main() -> None:
    """CLI entry point: score (or reload) prompts, build the dashboard, serve."""
    args = parse_args()
    staging_dir = pathlib.Path(args.staging)
    staging_dir.mkdir(parents=True, exist_ok=True)
    output_path = pathlib.Path(OUTPUT_HTML)
    if args.dashboard_only:
        # Rebuild from a previous run's persisted scores — never touch the judge.
        scores_path = staging_dir / "scores" / f"{args.run_id}_scores.json"
        if not scores_path.exists():
            log.error(f"Scores file not found: {scores_path}")
            log.error("Run without --dashboard-only first to generate scores.")
            sys.exit(1)
        all_scores = json.loads(scores_path.read_text(encoding="utf-8"))
        log.info(f"Loaded {len(all_scores)} entries from {scores_path.name}")
    else:
        # Full pipeline: judge (unless --no-judge) and persist fresh scores.
        all_scores = run_scoring(
            run_id=args.run_id,
            csv_path=pathlib.Path(args.csv),
            staging_dir=staging_dir,
            model=args.model,
            base_url=args.base_url,
            prompts_only=args.prompts_only,
            use_judge=not args.no_judge,
        )
    generate_dashboard(all_scores, output_path, args.run_id)
    if not args.no_serve:
        serve_dashboard(output_path, args.port)


if __name__ == "__main__":
    main()