Skip to content

Tasks#

Task loader reads task.toml + instruction.md and produces Inspect AI Task objects.

inspect_coco.tasks.loader #

Task loader — reads task.toml + instruction.md and produces Inspect Tasks.

IDDThresholdError #

Bases: Exception

Raised when instruction IDD score is below threshold in strict mode.

coco_task(task_dir, timeout_sec=900, epochs=None, idd_threshold=None, idd_strict=False) #

Load a CoCo eval task from a task.toml directory.

Reads task configuration, instruction, and test script. Runs IDD pre-check on the instruction and configures the Inspect Task with auto-epochs for consistency measurement.

Parameters:

Name Type Description Default
task_dir str

Path to directory containing task.toml + instruction.md.

required
timeout_sec int

Default agent timeout (overridden by task.toml).

900
epochs int | None

Number of epochs for pass@k. When None, the epochs value from the task.toml metadata is used, falling back to the default of 3.

None
idd_threshold float | None

IDD score threshold. When None, the idd_threshold value from the task.toml metadata is used, falling back to the default of 0.6.

None
idd_strict bool

If True, fail below threshold instead of warning.

False
Source code in src/inspect_coco/tasks/loader.py
@task
def coco_task(
    task_dir: str,
    timeout_sec: int = 900,
    epochs: int | None = None,
    idd_threshold: float | None = None,
    idd_strict: bool = False,
) -> Task:
    """Load a CoCo eval task from a task.toml directory.

    Reads task configuration, instruction, and test script. Runs IDD
    pre-check on the instruction and configures the Inspect Task with
    auto-epochs for consistency measurement.

    Precedence differs per setting:

    * ``timeout_sec`` is only a fallback; ``[agent].timeout_sec`` in
      task.toml wins when present.
    * ``epochs`` and ``idd_threshold`` win when passed explicitly; the
      task.toml metadata is consulted only when they are left unset, and
      the module defaults apply last.
    * ``idd_strict`` is OR-ed with the task.toml metadata flag, so either
      side can enable strict mode.

    Args:
        task_dir: Path to directory containing task.toml + instruction.md.
        timeout_sec: Default agent timeout (overridden by task.toml).
        epochs: Number of epochs for pass@k. Falls back to the ``epochs``
            metadata key, then ``DEFAULT_EPOCHS``. A falsy value (``None``
            or ``0``) is treated as "not provided".
        idd_threshold: IDD score threshold. Falls back to the
            ``idd_threshold`` metadata key, then ``DEFAULT_IDD_THRESHOLD``.
            An explicit ``0.0`` is honoured.
        idd_strict: If True, fail below threshold instead of warning.

    Returns:
        The configured Inspect ``Task``.

    Raises:
        IDDThresholdError: Presumably raised by the IDD pre-check when the
            score is below the threshold in strict mode (the helper is
            defined elsewhere; confirm there).
    """
    task_path = Path(task_dir)

    # Load task.toml
    config = _load_task_config(task_path)
    metadata = config.get("metadata", {})
    agent_config = config.get("agent", {})
    env_config = config.get("environment", {})

    # Load instruction.md
    instruction = _load_instruction(task_path)

    # IDD pre-check.
    # Compare against None rather than relying on truthiness: 0.0 is a
    # legitimate threshold and must not be replaced by the fallback.
    if idd_threshold is not None:
        threshold = idd_threshold
    else:
        threshold = metadata.get("idd_threshold", DEFAULT_IDD_THRESHOLD)
    strict = idd_strict or metadata.get("idd_strict", False)
    idd_metadata = _run_idd_check(instruction, threshold, strict, task_path.name)

    # Merge IDD scores into task metadata (persisted in eval log)
    metadata = {**metadata, **idd_metadata}

    # Resolve epochs (auto-epochs for consistency measurement).
    # Truthiness is kept on purpose here: a zero-epoch request falls back
    # to the configured/default value instead of being passed through.
    resolved_epochs = epochs or metadata.get("epochs", DEFAULT_EPOCHS)

    # Resolve sandbox
    sandbox_spec = _resolve_sandbox(task_path, env_config)

    # Build dataset
    dataset = _build_dataset(instruction, task_path)

    # Build agent solver; every agent option comes from the [agent] table,
    # with the function argument serving only as the timeout fallback.
    agent_solver = as_solver(
        coco(
            timeout_sec=agent_config.get("timeout_sec", timeout_sec),
            max_turns=agent_config.get("max_turns"),
            remove_skills=agent_config.get("remove_skills"),
            model_name=agent_config.get("model"),
            connection_name=agent_config.get("connection"),
            workdir=agent_config.get("workdir", "/workspace"),
        )
    )

    # Build scorers
    test_cmd = _resolve_test_cmd(task_path, env_config)
    scorers = [
        verification(test_cmd=test_cmd, timeout=env_config.get("test_timeout", 300)),
        idd_quality(instruction=instruction, threshold=threshold),
    ]

    return Task(
        dataset=dataset,
        solver=agent_solver,
        scorer=scorers,
        sandbox=sandbox_spec,
        epochs=resolved_epochs,
        name=metadata.get("name", task_path.name),
        metadata=metadata,
    )