Skip to content

Scorers#

Scorers run after the agent completes and produce Score objects with metrics.

Verification#

Runs test commands in the sandbox and reports pass/fail.

inspect_coco.scorers.verification #

Verification scorer — runs test command in sandbox and reports pass/fail.

verification(test_cmd='bash /workspace/tests/test.sh', timeout=300) #

Score by running a test command in the sandbox.

Executes the test command inside the Docker sandbox after the agent completes. Exit code 0 = pass (1.0), non-zero = fail (0.0).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| test_cmd | str | Shell command to run in the sandbox. | 'bash /workspace/tests/test.sh' |
| timeout | int | Maximum seconds for test execution. | 300 |
Source code in src/inspect_coco/scorers/verification.py
@scorer(metrics=[pass_rate()], name="Verification")
def verification(test_cmd: str = "bash /workspace/tests/test.sh", timeout: int = 300):
    """Score by running a test command in the sandbox.

    Executes the test command inside the sandbox after the agent
    completes. Exit code 0 = pass (1.0), non-zero = fail (0.0). A test run
    that exceeds ``timeout`` is also scored as a fail (0.0) rather than
    surfacing as an unhandled error.

    Args:
        test_cmd: Shell command to run in the sandbox.
        timeout: Maximum seconds for test execution.

    Returns:
        An async scorer function taking ``(state, target)`` and returning
        a ``Score``.
    """

    async def score(state: TaskState, target: Target) -> Score:
        answer = state.output.completion if state.output else ""

        try:
            result = await sandbox().exec(
                cmd=["bash", "-c", test_cmd],
                timeout=timeout,
            )
        except TimeoutError:
            # A test command that never finishes counts as a failed
            # verification, not as an evaluation error.
            return Score(
                value=0.0,
                answer=answer,
                explanation=f"TIMEOUT after {timeout}s running: {test_cmd}",
            )

        if result.returncode == 0:
            return Score(value=1.0, answer=answer, explanation=result.stdout)

        explanation = f"EXIT {result.returncode}\n{result.stderr}"
        if result.stdout:
            # Many test runners print failure details to stdout, so keep
            # it alongside stderr instead of discarding it.
            explanation += f"\n--- stdout ---\n{result.stdout}"

        return Score(value=0.0, answer=answer, explanation=explanation)

    return score

IDD Quality#

Reports instruction quality as a score in the eval summary.

inspect_coco.scorers.idd_quality #

IDD quality scorer — surfaces instruction quality score in eval results.

idd_quality(instruction, threshold=0.6) #

Score the IDD quality of the instruction.

The instruction is scored once, when the scorer is created, and the same IDD quality score is then reported for every sample. It does not depend on sandbox execution.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| instruction | str | The raw instruction text to score. | required |
| threshold | float | Pass threshold for IDD score. | 0.6 |
Source code in src/inspect_coco/scorers/idd_quality.py
@scorer(metrics=[idd_score()], name="IDD Quality")
def idd_quality(instruction: str, threshold: float = 0.6):
    """Report the IDD quality of an instruction as a sample score.

    The instruction is evaluated a single time, when the scorer is built;
    every scored sample then receives that same result. Nothing here
    touches the sandbox.

    Args:
        instruction: The raw instruction text to score.
        threshold: Total IDD score at or above which ``idd_passed`` is
            set in the metadata; it does not change the score value.

    Returns:
        An async scorer function taking ``(state, target)`` and returning
        a ``Score``.
    """
    # Imported at call time rather than at module import time.
    from inspect_coco.idd import score_instruction

    quality = score_instruction(instruction)

    async def score(state: TaskState, target: Target) -> Score:
        total = quality.total
        goal = quality.goal.score
        reqs = quality.requirements.score
        cons = quality.constraints.score
        out = quality.output.score

        # Compact per-dimension breakdown shown as the "answer".
        breakdown = f"goal={goal:.2f} req={reqs:.2f} con={cons:.2f} out={out:.2f}"
        details = {
            "idd_goal": goal,
            "idd_requirements": reqs,
            "idd_constraints": cons,
            "idd_output": out,
            "idd_passed": total >= threshold,
        }

        return Score(
            value=total,
            answer=breakdown,
            explanation=f"IDD={total:.2f} (threshold={threshold})",
            metadata=details,
        )

    return score

idd_score() #

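Average IDD quality score across samples, rounded to two decimal places. Non-numeric score values are ignored, and 0.0 is returned when there are no numeric scores.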

Source code in src/inspect_coco/scorers/idd_quality.py
@metric(name="IDD Score")
def idd_score() -> Metric:
    """Mean IDD quality score over all samples.

    Only ``int``/``float`` score values take part in the mean. The result
    is rounded to two decimal places; with no numeric values it is 0.0.
    """

    def metric_fn(scores: list[SampleScore]) -> Value:
        numeric = []
        for sample in scores:
            value = sample.score.value
            if isinstance(value, int | float):
                numeric.append(value)

        if not numeric:
            return 0.0
        return round(sum(numeric) / len(numeric), 2)

    return metric_fn