Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 49 additions & 2 deletions src/libkernelbot/leaderboard_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -1257,7 +1257,11 @@ def get_user_submissions(
offset: Offset for pagination

Returns:
List of submission dictionaries with summary info and runs
List of submission dictionaries with summary info and runs. Each
entry includes ``status`` ("pending"/"failed"/"done") and
``secret_score`` (the secret leaderboard geomean score, the ranking
metric; ``None`` if absent). The public leaderboard score remains
available per-run in ``runs[].score``.
"""
# Validate and clamp inputs
limit = max(1, min(limit, 100))
Expand Down Expand Up @@ -1325,16 +1329,59 @@ def get_user_submissions(
"score": run_row[2],
})

# Per-submission status + secret score. The `runs` above already
# carry the public leaderboard score (in runs[].score), but they are
# ranking-filtered (anti-cheat: only public runs whose matching
# secret run passed) and never include secret runs, so two things
# are not derivable from them:
# - secret_score: the secret leaderboard run's score (the actual
# ranking metric). Visible to the owner, as the detail endpoint
# already exposes it; the list endpoint just never selected it.
# - whether any run failed, so a finished-but-failed submission can
# be told apart from a clean one (both otherwise look "done").
# One extra aggregate over the same runs rows (keyed by
# submission_id, like runs_query) avoids an N+1 detail fetch per row.
#
# MIN(score): a submission can have a secret leaderboard run per GPU;
# take the best (lowest) to match how the public score is summarized.
agg_query = """
SELECT submission_id,
MIN(score) FILTER (
WHERE mode = 'leaderboard' AND secret AND passed
) AS secret_score,
Comment on lines +1349 to +1351

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this here. Can you explain this to me? Why MIN(score)?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A submission can have a secret leaderboard run per GPU type, so there can be more than one secret score. MIN takes the best (lowest = fastest) one, matching how the existing public score is summarized for the row (the runs are likewise per-GPU and the caller/CLI takes the min). For single-GPU leaderboards like qr_v2 there's exactly one, so MIN is just that value. Happy to switch to per-GPU secret scores instead if you'd prefer symmetry with runs, but a single ranking number seemed more useful for the list view.

bool_or(NOT passed) AS has_failed_run
FROM leaderboard.runs
WHERE submission_id = ANY(%s)
GROUP BY submission_id
Comment on lines +1348 to +1355

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you benchmark how long this SQL query will take?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. On the docker-compose test Postgres, seeded with 100 submissions × 6 runs (600 rows), the added aggregate query runs at ~0.25 ms/call for a full 100-submission page, versus ~21 ms for the whole get_user_submissions call — about 1% overhead. It's the same access pattern as the existing runs query in this method (WHERE submission_id = ANY(%s), grouped) over the same rows, so no new join or table. And it replaces what a client otherwise needs up to 100 detail round-trips to compute.

"""
self.cursor.execute(agg_query, (submission_ids,))
agg_by_submission: dict = {
row[0]: {"secret_score": row[1], "has_failed_run": row[2]}
for row in self.cursor.fetchall()
}

# Build result with runs grouped by submission
results = []
for row in submissions:
sub_id = row[0]
done = row[4]
agg = agg_by_submission.get(sub_id, {})

if not done:
status = "pending"
elif agg.get("has_failed_run"):
status = "failed"
else:
status = "done"

results.append({
"id": sub_id,
"leaderboard_name": row[1],
"file_name": row[2],
"submission_time": row[3],
"done": row[4],
"done": done,
"status": status,
"secret_score": agg.get("secret_score"),
"runs": runs_by_submission.get(sub_id, []),
})
return results
Expand Down
100 changes: 100 additions & 0 deletions tests/test_leaderboard_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -1066,6 +1066,106 @@ def test_get_user_submissions_with_multiple_runs(database, submit_leaderboard):
assert 2.0 in scores


def test_get_user_submissions_status_and_secret_score_on_success(database, submit_leaderboard):
    """Check the list entry of a submission whose public and secret runs both pass.

    The entry must report ``status == "done"`` and carry the secret
    leaderboard run's score in ``secret_score``, while the public score is
    still reported per run through ``runs[].score``.
    """
    # Fields that were already part of the response before ``status`` and
    # ``secret_score`` were added; existing clients (popcorn-cli, kernelboard)
    # rely on them, so the change has to stay purely additive.
    legacy_entry_keys = {
        "id", "leaderboard_name", "file_name", "submission_time", "done", "runs",
    }
    legacy_run_keys = {"gpu_type", "score"}

    with database as db:
        submitted_at = datetime.datetime.now(tz=datetime.timezone.utc)
        submission_id = db.create_submission(
            "submit-leaderboard", "ok.py", 5, "code", submitted_at, user_name="user5"
        )
        # One public (1.5) and one secret (1.7) leaderboard run on the same runner.
        for is_secret, run_score in ((False, 1.5), (True, 1.7)):
            _create_submission_run(
                db, submission_id, mode="leaderboard", secret=is_secret,
                runner="A100", score=run_score,
            )
        db.mark_submission_done(submission_id)

        listing = db.get_user_submissions(user_id="5")
        assert len(listing) == 1
        entry = listing[0]

        assert entry["status"] == "done"
        # Scores come back as Decimal from Postgres, hence the float() casts.
        assert float(entry["secret_score"]) == 1.7
        # The public score keeps living on the individual runs.
        public_scores = [float(run["score"]) for run in entry["runs"]]
        assert public_scores == [1.5]

        # Backward compatibility: every pre-existing key is still present.
        assert legacy_entry_keys.issubset(entry)
        assert legacy_run_keys.issubset(entry["runs"][0])


def test_get_user_submissions_status_failed_when_secret_run_failed(database, submit_leaderboard):
    """Check that a failing secret run yields status 'failed' with scores and runs hidden.

    With the secret leaderboard run failing, the entry is expected to have no
    ``secret_score`` and an empty ``runs`` list (anti-cheat ranking filter).
    """
    failing_result = dataclasses.replace(sample_run_result(), passed=False)
    shared_run_args = {"mode": "leaderboard", "runner": "A100"}

    with database as db:
        submitted_at = datetime.datetime.now(tz=datetime.timezone.utc)
        submission_id = db.create_submission(
            "submit-leaderboard", "bad.py", 5, "code", submitted_at, user_name="user5"
        )
        # Public run passes with a score ...
        _create_submission_run(db, submission_id, secret=False, score=1.5, **shared_run_args)
        # ... but its secret counterpart fails and therefore has no score.
        _create_submission_run(
            db, submission_id, secret=True, score=None, result=failing_result,
            **shared_run_args,
        )
        db.mark_submission_done(submission_id)

        listing = db.get_user_submissions(user_id="5")
        assert len(listing) == 1
        entry = listing[0]

        assert entry["status"] == "failed"
        # No passing secret score exists, and the ranking filter withholds the
        # public run as well.
        assert entry["secret_score"] is None
        assert entry["runs"] == []


def test_get_user_submissions_status_failed_keeps_runs_when_public_run_failed(
    database, submit_leaderboard
):
    """Check that a failing *public* run gives status 'failed' without hiding every run.

    In contrast to a failing secret run, a failing public run is not expected
    to trigger the anti-cheat full hide, so at least one run must remain
    listed for the submission.
    """
    failing_result = dataclasses.replace(sample_run_result(), passed=False)

    with database as db:
        submitted_at = datetime.datetime.now(tz=datetime.timezone.utc)
        submission_id = db.create_submission(
            "submit-leaderboard", "bad_public.py", 5, "code", submitted_at, user_name="user5"
        )
        # Run mix: passing public test, failing public leaderboard run,
        # passing secret leaderboard run.
        _create_submission_run(db, submission_id, mode="test", secret=False, runner="A100")
        _create_submission_run(
            db, submission_id, mode="leaderboard", secret=False, runner="A100",
            score=None, result=failing_result,
        )
        _create_submission_run(
            db, submission_id, mode="leaderboard", secret=True, runner="A100", score=1.7
        )
        db.mark_submission_done(submission_id)

        listing = db.get_user_submissions(user_id="5")
        assert len(listing) == 1
        entry = listing[0]

        assert entry["status"] == "failed"
        # Not a full hide: the passing public test run is still reported.
        assert len(entry["runs"]) >= 1


def test_get_user_submissions_status_pending_when_not_done(database, submit_leaderboard):
    """Check that a submission never marked done is listed with status 'pending'."""
    with database as db:
        submitted_at = datetime.datetime.now(tz=datetime.timezone.utc)
        submission_id = db.create_submission(
            "submit-leaderboard", "wip.py", 5, "code", submitted_at, user_name="user5"
        )
        _create_submission_run(
            db, submission_id, mode="leaderboard", secret=False, runner="A100", score=1.5
        )
        # Deliberately no mark_submission_done() call: the submission stays open.

        listing = db.get_user_submissions(user_id="5")
        assert len(listing) == 1
        entry = listing[0]
        assert entry["status"] == "pending"


def test_check_leaderboard_access_public(database, submit_leaderboard):
"""Public leaderboards grant access to everyone."""
with database as db:
Expand Down