terminal-bench

v2.0

Version 2.0 of Terminal-Bench, a benchmark for testing agents in terminal environments. Compared with 1.0, it contains more tasks, and those tasks are harder and of higher quality.

uvx harbor run -d terminal-bench@2.0

Tasks (89)

adaptive-rejection-sampler
uvx harbor run -d terminal-bench@2.0 -t adaptive-rejection-sampler
69671fb
bn-fit-modify
uvx harbor run -d terminal-bench@2.0 -t bn-fit-modify
69671fb
break-filter-js-from-html
uvx harbor run -d terminal-bench@2.0 -t break-filter-js-from-html
69671fb
build-cython-ext
uvx harbor run -d terminal-bench@2.0 -t build-cython-ext
69671fb
build-pmars
uvx harbor run -d terminal-bench@2.0 -t build-pmars
69671fb
build-pov-ray
uvx harbor run -d terminal-bench@2.0 -t build-pov-ray
69671fb
caffe-cifar-10
uvx harbor run -d terminal-bench@2.0 -t caffe-cifar-10
69671fb
cancel-async-tasks
uvx harbor run -d terminal-bench@2.0 -t cancel-async-tasks
69671fb
chess-best-move
uvx harbor run -d terminal-bench@2.0 -t chess-best-move
69671fb
circuit-fibsqrt
uvx harbor run -d terminal-bench@2.0 -t circuit-fibsqrt
69671fb
cobol-modernization
uvx harbor run -d terminal-bench@2.0 -t cobol-modernization
69671fb
code-from-image
uvx harbor run -d terminal-bench@2.0 -t code-from-image
69671fb
compile-compcert
uvx harbor run -d terminal-bench@2.0 -t compile-compcert
69671fb
configure-git-webserver
uvx harbor run -d terminal-bench@2.0 -t configure-git-webserver
69671fb
constraints-scheduling
uvx harbor run -d terminal-bench@2.0 -t constraints-scheduling
69671fb
count-dataset-tokens
uvx harbor run -d terminal-bench@2.0 -t count-dataset-tokens
69671fb
crack-7z-hash
uvx harbor run -d terminal-bench@2.0 -t crack-7z-hash
69671fb
custom-memory-heap-crash
uvx harbor run -d terminal-bench@2.0 -t custom-memory-heap-crash
69671fb
db-wal-recovery
uvx harbor run -d terminal-bench@2.0 -t db-wal-recovery
69671fb
distribution-search
uvx harbor run -d terminal-bench@2.0 -t distribution-search
69671fb
dna-assembly
uvx harbor run -d terminal-bench@2.0 -t dna-assembly
69671fb
dna-insert
uvx harbor run -d terminal-bench@2.0 -t dna-insert
69671fb
extract-elf
uvx harbor run -d terminal-bench@2.0 -t extract-elf
69671fb
extract-moves-from-video
uvx harbor run -d terminal-bench@2.0 -t extract-moves-from-video
69671fb
feal-differential-cryptanalysis
uvx harbor run -d terminal-bench@2.0 -t feal-differential-cryptanalysis
69671fb
feal-linear-cryptanalysis
uvx harbor run -d terminal-bench@2.0 -t feal-linear-cryptanalysis
69671fb
filter-js-from-html
uvx harbor run -d terminal-bench@2.0 -t filter-js-from-html
69671fb
financial-document-processor
uvx harbor run -d terminal-bench@2.0 -t financial-document-processor
69671fb
fix-code-vulnerability
uvx harbor run -d terminal-bench@2.0 -t fix-code-vulnerability
69671fb
fix-git
uvx harbor run -d terminal-bench@2.0 -t fix-git
69671fb
fix-ocaml-gc
uvx harbor run -d terminal-bench@2.0 -t fix-ocaml-gc
69671fb
gcode-to-text
uvx harbor run -d terminal-bench@2.0 -t gcode-to-text
69671fb
git-leak-recovery
uvx harbor run -d terminal-bench@2.0 -t git-leak-recovery
69671fb
git-multibranch
uvx harbor run -d terminal-bench@2.0 -t git-multibranch
69671fb
gpt2-codegolf
uvx harbor run -d terminal-bench@2.0 -t gpt2-codegolf
69671fb
headless-terminal
uvx harbor run -d terminal-bench@2.0 -t headless-terminal
69671fb
hf-model-inference
uvx harbor run -d terminal-bench@2.0 -t hf-model-inference
69671fb
install-windows-3.11
uvx harbor run -d terminal-bench@2.0 -t install-windows-3.11
69671fb
kv-store-grpc
uvx harbor run -d terminal-bench@2.0 -t kv-store-grpc
69671fb
large-scale-text-editing
uvx harbor run -d terminal-bench@2.0 -t large-scale-text-editing
69671fb
largest-eigenval
uvx harbor run -d terminal-bench@2.0 -t largest-eigenval
69671fb
llm-inference-batching-scheduler
uvx harbor run -d terminal-bench@2.0 -t llm-inference-batching-scheduler
69671fb
log-summary-date-ranges
uvx harbor run -d terminal-bench@2.0 -t log-summary-date-ranges
69671fb
mailman
uvx harbor run -d terminal-bench@2.0 -t mailman
69671fb
make-doom-for-mips
uvx harbor run -d terminal-bench@2.0 -t make-doom-for-mips
69671fb
make-mips-interpreter
uvx harbor run -d terminal-bench@2.0 -t make-mips-interpreter
69671fb
mcmc-sampling-stan
uvx harbor run -d terminal-bench@2.0 -t mcmc-sampling-stan
69671fb
merge-diff-arc-agi-task
uvx harbor run -d terminal-bench@2.0 -t merge-diff-arc-agi-task
69671fb
model-extraction-relu-logits
uvx harbor run -d terminal-bench@2.0 -t model-extraction-relu-logits
69671fb
modernize-scientific-stack
uvx harbor run -d terminal-bench@2.0 -t modernize-scientific-stack
69671fb
mteb-leaderboard
uvx harbor run -d terminal-bench@2.0 -t mteb-leaderboard
69671fb
mteb-retrieve
uvx harbor run -d terminal-bench@2.0 -t mteb-retrieve
69671fb
multi-source-data-merger
uvx harbor run -d terminal-bench@2.0 -t multi-source-data-merger
69671fb
nginx-request-logging
uvx harbor run -d terminal-bench@2.0 -t nginx-request-logging
69671fb
openssl-selfsigned-cert
uvx harbor run -d terminal-bench@2.0 -t openssl-selfsigned-cert
69671fb
overfull-hbox
uvx harbor run -d terminal-bench@2.0 -t overfull-hbox
69671fb
password-recovery
uvx harbor run -d terminal-bench@2.0 -t password-recovery
69671fb
path-tracing
uvx harbor run -d terminal-bench@2.0 -t path-tracing
69671fb
path-tracing-reverse
uvx harbor run -d terminal-bench@2.0 -t path-tracing-reverse
69671fb
polyglot-c-py
uvx harbor run -d terminal-bench@2.0 -t polyglot-c-py
69671fb
polyglot-rust-c
uvx harbor run -d terminal-bench@2.0 -t polyglot-rust-c
69671fb
portfolio-optimization
uvx harbor run -d terminal-bench@2.0 -t portfolio-optimization
69671fb
protein-assembly
uvx harbor run -d terminal-bench@2.0 -t protein-assembly
69671fb
prove-plus-comm
uvx harbor run -d terminal-bench@2.0 -t prove-plus-comm
69671fb
pypi-server
uvx harbor run -d terminal-bench@2.0 -t pypi-server
69671fb
pytorch-model-cli
uvx harbor run -d terminal-bench@2.0 -t pytorch-model-cli
69671fb
pytorch-model-recovery
uvx harbor run -d terminal-bench@2.0 -t pytorch-model-recovery
69671fb
qemu-alpine-ssh
uvx harbor run -d terminal-bench@2.0 -t qemu-alpine-ssh
69671fb
qemu-startup
uvx harbor run -d terminal-bench@2.0 -t qemu-startup
69671fb
query-optimize
uvx harbor run -d terminal-bench@2.0 -t query-optimize
69671fb
raman-fitting
uvx harbor run -d terminal-bench@2.0 -t raman-fitting
69671fb
regex-chess
uvx harbor run -d terminal-bench@2.0 -t regex-chess
69671fb
regex-log
uvx harbor run -d terminal-bench@2.0 -t regex-log
69671fb
reshard-c4-data
uvx harbor run -d terminal-bench@2.0 -t reshard-c4-data
69671fb
rstan-to-pystan
uvx harbor run -d terminal-bench@2.0 -t rstan-to-pystan
69671fb
sam-cell-seg
uvx harbor run -d terminal-bench@2.0 -t sam-cell-seg
69671fb
sanitize-git-repo
uvx harbor run -d terminal-bench@2.0 -t sanitize-git-repo
69671fb
schemelike-metacircular-eval
uvx harbor run -d terminal-bench@2.0 -t schemelike-metacircular-eval
69671fb
sparql-university
uvx harbor run -d terminal-bench@2.0 -t sparql-university
69671fb
sqlite-db-truncate
uvx harbor run -d terminal-bench@2.0 -t sqlite-db-truncate
69671fb
sqlite-with-gcov
uvx harbor run -d terminal-bench@2.0 -t sqlite-with-gcov
69671fb
torch-pipeline-parallelism
uvx harbor run -d terminal-bench@2.0 -t torch-pipeline-parallelism
69671fb
torch-tensor-parallelism
uvx harbor run -d terminal-bench@2.0 -t torch-tensor-parallelism
69671fb
train-fasttext
uvx harbor run -d terminal-bench@2.0 -t train-fasttext
69671fb
tune-mjcf
uvx harbor run -d terminal-bench@2.0 -t tune-mjcf
69671fb
video-processing
uvx harbor run -d terminal-bench@2.0 -t video-processing
69671fb
vulnerable-secret
uvx harbor run -d terminal-bench@2.0 -t vulnerable-secret
69671fb
winning-avg-corewars
uvx harbor run -d terminal-bench@2.0 -t winning-avg-corewars
69671fb
write-compressor
uvx harbor run -d terminal-bench@2.0 -t write-compressor
69671fb