From d8fcbdf7c6f6243d26ead5d59eac6709b9270f8d Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach Date: Tue, 23 Jun 2026 21:50:58 +0000 Subject: [PATCH 1/3] Add status and public/secret scores to /user/submissions list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The list endpoint (`db.get_user_submissions`) returned only the public, ranking-filtered run score per submission — it dropped the secret leaderboard score and any failure signal entirely. So a CLI/webapp listing could not show the secret (ranking) score, and a submission with a failed run was indistinguishable from a successful one (both showed as "done" with whatever public score survived the anti-cheat filter). Surfacing these required an extra detail fetch per row client-side. Add them to the list payload directly: - `status`: "pending" (not done), "failed" (done with any failed run), or "done". Mirrors the canonical done/failed logic in get_hf_export_rows.sql. - `public_score` / `secret_score`: the geomean leaderboard scores. The public score keeps the existing ranking-eligibility filter; the secret score is the owner's passing secret leaderboard run (owners already see this via the detail endpoint). The existing `runs`/`done` fields are unchanged for backward compat. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/libkernelbot/leaderboard_db.py | 51 +++++++++++++++++++++++-- tests/test_leaderboard_db.py | 60 ++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 3 deletions(-) diff --git a/src/libkernelbot/leaderboard_db.py b/src/libkernelbot/leaderboard_db.py index 9464f861..e4172227 100644 --- a/src/libkernelbot/leaderboard_db.py +++ b/src/libkernelbot/leaderboard_db.py @@ -1257,7 +1257,10 @@ def get_user_submissions( offset: Offset for pagination Returns: - List of submission dictionaries with summary info and runs + List of submission dictionaries with summary info and runs. 
Each + entry includes ``status`` ("pending"/"failed"/"done"), + ``public_score`` and ``secret_score`` (the geomean leaderboard + scores, either may be ``None``), plus the public ``runs`` list. """ # Validate and clamp inputs limit = max(1, min(limit, 100)) @@ -1325,17 +1328,59 @@ def get_user_submissions( "score": run_row[2], }) + # Per-submission status and leaderboard scores. The public `runs` + # above are ranking-filtered (anti-cheat: the public score is hidden + # unless the matching secret run passed). Here we additionally + # surface the secret leaderboard score (visible to the owner, as the + # detail endpoint already does) and whether any run failed, so + # callers can show an accurate status and both scores without an + # extra request per submission. + agg_query = """ + SELECT submission_id, + MIN(score) FILTER ( + WHERE mode = 'leaderboard' AND secret AND passed + ) AS secret_score, + bool_or(NOT passed) AS has_failed_run + FROM leaderboard.runs + WHERE submission_id = ANY(%s) + GROUP BY submission_id + """ + self.cursor.execute(agg_query, (submission_ids,)) + agg_by_submission: dict = { + row[0]: {"secret_score": row[1], "has_failed_run": row[2]} + for row in self.cursor.fetchall() + } + # Build result with runs grouped by submission results = [] for row in submissions: sub_id = row[0] + done = row[4] + public_runs = runs_by_submission.get(sub_id, []) + agg = agg_by_submission.get(sub_id, {}) + + # The public leaderboard score (lowest across GPUs), already + # ranking-eligible by construction of `runs_query`. 
+ public_scores = [r["score"] for r in public_runs if r["score"] is not None] + public_score = min(public_scores) if public_scores else None + + if not done: + status = "pending" + elif agg.get("has_failed_run"): + status = "failed" + else: + status = "done" + results.append({ "id": sub_id, "leaderboard_name": row[1], "file_name": row[2], "submission_time": row[3], - "done": row[4], - "runs": runs_by_submission.get(sub_id, []), + "done": done, + "status": status, + "public_score": public_score, + "secret_score": agg.get("secret_score"), + "runs": public_runs, }) return results except psycopg2.Error as e: diff --git a/tests/test_leaderboard_db.py b/tests/test_leaderboard_db.py index 621b7391..66ac0deb 100644 --- a/tests/test_leaderboard_db.py +++ b/tests/test_leaderboard_db.py @@ -1066,6 +1066,66 @@ def test_get_user_submissions_with_multiple_runs(database, submit_leaderboard): assert 2.0 in scores +def test_get_user_submissions_status_and_scores_on_success(database, submit_leaderboard): + """A fully-passing submission reports status 'done' with public+secret scores.""" + with database as db: + sub = db.create_submission( + "submit-leaderboard", "ok.py", 5, "code", + datetime.datetime.now(tz=datetime.timezone.utc), user_name="user5", + ) + _create_submission_run(db, sub, mode="leaderboard", secret=False, runner="A100", score=1.5) + _create_submission_run(db, sub, mode="leaderboard", secret=True, runner="A100", score=1.7) + db.mark_submission_done(sub) + + result = db.get_user_submissions(user_id="5") + assert len(result) == 1 + assert result[0]["status"] == "done" + # Scores come back as Decimal from Postgres (as the existing `runs` + # score does); compare as float. 
+ assert float(result[0]["public_score"]) == 1.5 + assert float(result[0]["secret_score"]) == 1.7 + + +def test_get_user_submissions_status_failed_when_run_failed(database, submit_leaderboard): + """A submission with a failed run reports status 'failed'.""" + failed = dataclasses.replace(sample_run_result(), passed=False) + with database as db: + sub = db.create_submission( + "submit-leaderboard", "bad.py", 5, "code", + datetime.datetime.now(tz=datetime.timezone.utc), user_name="user5", + ) + _create_submission_run(db, sub, mode="leaderboard", secret=False, runner="A100", score=1.5) + _create_submission_run( + db, sub, mode="leaderboard", secret=True, runner="A100", + score=None, result=failed, + ) + db.mark_submission_done(sub) + + result = db.get_user_submissions(user_id="5") + assert len(result) == 1 + assert result[0]["status"] == "failed" + # Ranking-eligibility (and thus the public score / runs) is withheld + # when the secret run failed, but the secret score stays None too. + assert result[0]["public_score"] is None + assert result[0]["secret_score"] is None + assert result[0]["runs"] == [] + + +def test_get_user_submissions_status_pending_when_not_done(database, submit_leaderboard): + """A not-yet-finished submission reports status 'pending'.""" + with database as db: + sub = db.create_submission( + "submit-leaderboard", "wip.py", 5, "code", + datetime.datetime.now(tz=datetime.timezone.utc), user_name="user5", + ) + _create_submission_run(db, sub, mode="leaderboard", secret=False, runner="A100", score=1.5) + # Not marked done. 
+ + result = db.get_user_submissions(user_id="5") + assert len(result) == 1 + assert result[0]["status"] == "pending" + + def test_check_leaderboard_access_public(database, submit_leaderboard): """Public leaderboards grant access to everyone.""" with database as db: From 08d26040cbd3284c6a99375abe2d3d7a3843cc37 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach Date: Tue, 23 Jun 2026 22:19:58 +0000 Subject: [PATCH 2/3] Address review: drop redundant public_score, clarify status/secret_score Per review feedback: - Drop the new `public_score` field. The public leaderboard score is already exposed per-run in `runs[].score`; a separate top-level field was redundant. The list payload now only adds what was genuinely missing: `status` and `secret_score`. - Expand the comment to explain why these two (and not the public score) need new code, and why MIN(score) (best secret run across GPUs, to match how the public score is summarized). - Tests: assert the public score via runs[].score; add a case showing a failed *public* run keeps its passing runs (status 'failed', runs non-empty), distinct from a failed *secret* run (full anti-cheat hide, runs []). Aggregate query measured at ~0.25 ms for a full 100-submission page (600 runs), ~1% of the ~21 ms get_user_submissions call. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/libkernelbot/leaderboard_db.py | 38 +++++++++++----------- tests/test_leaderboard_db.py | 52 ++++++++++++++++++++++++------ 2 files changed, 62 insertions(+), 28 deletions(-) diff --git a/src/libkernelbot/leaderboard_db.py b/src/libkernelbot/leaderboard_db.py index e4172227..7884dbb6 100644 --- a/src/libkernelbot/leaderboard_db.py +++ b/src/libkernelbot/leaderboard_db.py @@ -1258,9 +1258,10 @@ def get_user_submissions( Returns: List of submission dictionaries with summary info and runs. 
Each - entry includes ``status`` ("pending"/"failed"/"done"), - ``public_score`` and ``secret_score`` (the geomean leaderboard - scores, either may be ``None``), plus the public ``runs`` list. + entry includes ``status`` ("pending"/"failed"/"done") and + ``secret_score`` (the secret leaderboard geomean score, the ranking + metric; ``None`` if absent). The public leaderboard score remains + available per-run in ``runs[].score``. """ # Validate and clamp inputs limit = max(1, min(limit, 100)) @@ -1328,13 +1329,21 @@ def get_user_submissions( "score": run_row[2], }) - # Per-submission status and leaderboard scores. The public `runs` - # above are ranking-filtered (anti-cheat: the public score is hidden - # unless the matching secret run passed). Here we additionally - # surface the secret leaderboard score (visible to the owner, as the - # detail endpoint already does) and whether any run failed, so - # callers can show an accurate status and both scores without an - # extra request per submission. + # Per-submission status + secret score. The `runs` above already + # carry the public leaderboard score (in runs[].score), but they are + # ranking-filtered (anti-cheat: only public runs whose matching + # secret run passed) and never include secret runs, so two things + # are not derivable from them: + # - secret_score: the secret leaderboard run's score (the actual + # ranking metric). Visible to the owner, as the detail endpoint + # already exposes it; the list endpoint just never selected it. + # - whether any run failed, so a finished-but-failed submission can + # be told apart from a clean one (both otherwise look "done"). + # One extra aggregate over the same runs rows (keyed by + # submission_id, like runs_query) avoids an N+1 detail fetch per row. + # + # MIN(score): a submission can have a secret leaderboard run per GPU; + # take the best (lowest) across GPUs so the row has one secret score. 
agg_query = """ SELECT submission_id, MIN(score) FILTER ( @@ -1356,14 +1365,8 @@ def get_user_submissions( for row in submissions: sub_id = row[0] done = row[4] - public_runs = runs_by_submission.get(sub_id, []) agg = agg_by_submission.get(sub_id, {}) - # The public leaderboard score (lowest across GPUs), already - # ranking-eligible by construction of `runs_query`. - public_scores = [r["score"] for r in public_runs if r["score"] is not None] - public_score = min(public_scores) if public_scores else None - if not done: status = "pending" elif agg.get("has_failed_run"): @@ -1378,9 +1381,8 @@ def get_user_submissions( "submission_time": row[3], "done": done, "status": status, - "public_score": public_score, "secret_score": agg.get("secret_score"), - "runs": public_runs, + "runs": runs_by_submission.get(sub_id, []), }) return results except psycopg2.Error as e: diff --git a/tests/test_leaderboard_db.py b/tests/test_leaderboard_db.py index 66ac0deb..f2d8ea1a 100644 --- a/tests/test_leaderboard_db.py +++ b/tests/test_leaderboard_db.py @@ -1066,8 +1066,11 @@ def test_get_user_submissions_with_multiple_runs(database, submit_leaderboard): assert 2.0 in scores -def test_get_user_submissions_status_and_scores_on_success(database, submit_leaderboard): - """A fully-passing submission reports status 'done' with public+secret scores.""" +def test_get_user_submissions_status_and_secret_score_on_success(database, submit_leaderboard): + """A fully-passing submission reports status 'done' with the secret score. + + The public score stays where it already was (runs[].score). + """ with database as db: sub = db.create_submission( "submit-leaderboard", "ok.py", 5, "code", @@ -1080,14 +1083,14 @@ def test_get_user_submissions_status_and_scores_on_success(database, submit_lead result = db.get_user_submissions(user_id="5") assert len(result) == 1 assert result[0]["status"] == "done" - # Scores come back as Decimal from Postgres (as the existing `runs` - # score does); compare as float. 
- assert float(result[0]["public_score"]) == 1.5 + # Scores come back as Decimal from Postgres; compare as float. assert float(result[0]["secret_score"]) == 1.7 + # Public score is unchanged: still exposed per-run. + assert [float(r["score"]) for r in result[0]["runs"]] == [1.5] -def test_get_user_submissions_status_failed_when_run_failed(database, submit_leaderboard): - """A submission with a failed run reports status 'failed'.""" +def test_get_user_submissions_status_failed_when_secret_run_failed(database, submit_leaderboard): + """A failed secret run -> status 'failed'; runs/scores stay hidden (anti-cheat).""" failed = dataclasses.replace(sample_run_result(), passed=False) with database as db: sub = db.create_submission( @@ -1104,13 +1107,42 @@ def test_get_user_submissions_status_failed_when_run_failed(database, submit_lea result = db.get_user_submissions(user_id="5") assert len(result) == 1 assert result[0]["status"] == "failed" - # Ranking-eligibility (and thus the public score / runs) is withheld - # when the secret run failed, but the secret score stays None too. - assert result[0]["public_score"] is None + # The failed secret run withholds the public score (ranking filter) and + # there is no passing secret score either. assert result[0]["secret_score"] is None assert result[0]["runs"] == [] +def test_get_user_submissions_status_failed_keeps_runs_when_public_run_failed( + database, submit_leaderboard +): + """A failed *public* run still reports its (passing) runs, with status 'failed'. + + Unlike a failed secret run, a failed public run does not trigger the + anti-cheat full-hide, so the passing public runs remain visible. + """ + failed = dataclasses.replace(sample_run_result(), passed=False) + with database as db: + sub = db.create_submission( + "submit-leaderboard", "bad_public.py", 5, "code", + datetime.datetime.now(tz=datetime.timezone.utc), user_name="user5", + ) + # Passing public test, failing public leaderboard run, passing secret. 
+ _create_submission_run(db, sub, mode="test", secret=False, runner="A100") + _create_submission_run( + db, sub, mode="leaderboard", secret=False, runner="A100", + score=None, result=failed, + ) + _create_submission_run(db, sub, mode="leaderboard", secret=True, runner="A100", score=1.7) + db.mark_submission_done(sub) + + result = db.get_user_submissions(user_id="5") + assert len(result) == 1 + assert result[0]["status"] == "failed" + # The passing public test run is still present (not a full hide). + assert len(result[0]["runs"]) >= 1 + + def test_get_user_submissions_status_pending_when_not_done(database, submit_leaderboard): """A not-yet-finished submission reports status 'pending'.""" with database as db: From 93e566f50fb4e5c7eac168bf28abd5ab5d9fc766 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach Date: Tue, 23 Jun 2026 22:58:24 +0000 Subject: [PATCH 3/3] Assert additive contract in get_user_submissions test Pin the backward-compat guarantee in a test: the pre-existing top-level and per-run keys must remain present, so a future change can't silently drop a field existing clients (popcorn-cli, kernelboard) rely on. New fields are added alongside, never replacing. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_leaderboard_db.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_leaderboard_db.py b/tests/test_leaderboard_db.py index f2d8ea1a..a738e7fd 100644 --- a/tests/test_leaderboard_db.py +++ b/tests/test_leaderboard_db.py @@ -1088,6 +1088,14 @@ def test_get_user_submissions_status_and_secret_score_on_success(database, submi # Public score is unchanged: still exposed per-run. assert [float(r["score"]) for r in result[0]["runs"]] == [1.5] + # Backward compat: the change is purely additive. The pre-existing + # fields must still be present so existing clients (popcorn-cli, + # kernelboard) keep working; new fields are added alongside. 
+ assert {"id", "leaderboard_name", "file_name", "submission_time", "done", "runs"} <= set( + result[0] + ) + assert {"gpu_type", "score"} <= set(result[0]["runs"][0]) + def test_get_user_submissions_status_failed_when_secret_run_failed(database, submit_leaderboard): """A failed secret run -> status 'failed'; runs/scores stay hidden (anti-cheat)."""