terminal-bench
v2.0
Version 2.0 of Terminal-Bench, a benchmark for testing agents in terminal environments. Compared with 1.0, it has more tasks, and they are harder and of higher quality.
uvx harbor run -d terminal-bench@2.0
Tasks (89)
adaptive-rejection-sampler
uvx harbor run -d terminal-bench@2.0 -t adaptive-rejection-sampler
69671fb
bn-fit-modify
uvx harbor run -d terminal-bench@2.0 -t bn-fit-modify
69671fb
break-filter-js-from-html
uvx harbor run -d terminal-bench@2.0 -t break-filter-js-from-html
69671fb
build-cython-ext
uvx harbor run -d terminal-bench@2.0 -t build-cython-ext
69671fb
build-pmars
uvx harbor run -d terminal-bench@2.0 -t build-pmars
69671fb
build-pov-ray
uvx harbor run -d terminal-bench@2.0 -t build-pov-ray
69671fb
caffe-cifar-10
uvx harbor run -d terminal-bench@2.0 -t caffe-cifar-10
69671fb
cancel-async-tasks
uvx harbor run -d terminal-bench@2.0 -t cancel-async-tasks
69671fb
chess-best-move
uvx harbor run -d terminal-bench@2.0 -t chess-best-move
69671fb
circuit-fibsqrt
uvx harbor run -d terminal-bench@2.0 -t circuit-fibsqrt
69671fb
cobol-modernization
uvx harbor run -d terminal-bench@2.0 -t cobol-modernization
69671fb
code-from-image
uvx harbor run -d terminal-bench@2.0 -t code-from-image
69671fb
compile-compcert
uvx harbor run -d terminal-bench@2.0 -t compile-compcert
69671fb
configure-git-webserver
uvx harbor run -d terminal-bench@2.0 -t configure-git-webserver
69671fb
constraints-scheduling
uvx harbor run -d terminal-bench@2.0 -t constraints-scheduling
69671fb
count-dataset-tokens
uvx harbor run -d terminal-bench@2.0 -t count-dataset-tokens
69671fb
crack-7z-hash
uvx harbor run -d terminal-bench@2.0 -t crack-7z-hash
69671fb
custom-memory-heap-crash
uvx harbor run -d terminal-bench@2.0 -t custom-memory-heap-crash
69671fb
db-wal-recovery
uvx harbor run -d terminal-bench@2.0 -t db-wal-recovery
69671fb
distribution-search
uvx harbor run -d terminal-bench@2.0 -t distribution-search
69671fb
dna-assembly
uvx harbor run -d terminal-bench@2.0 -t dna-assembly
69671fb
dna-insert
uvx harbor run -d terminal-bench@2.0 -t dna-insert
69671fb
extract-elf
uvx harbor run -d terminal-bench@2.0 -t extract-elf
69671fb
extract-moves-from-video
uvx harbor run -d terminal-bench@2.0 -t extract-moves-from-video
69671fb
feal-differential-cryptanalysis
uvx harbor run -d terminal-bench@2.0 -t feal-differential-cryptanalysis
69671fb
feal-linear-cryptanalysis
uvx harbor run -d terminal-bench@2.0 -t feal-linear-cryptanalysis
69671fb
filter-js-from-html
uvx harbor run -d terminal-bench@2.0 -t filter-js-from-html
69671fb
financial-document-processor
uvx harbor run -d terminal-bench@2.0 -t financial-document-processor
69671fb
fix-code-vulnerability
uvx harbor run -d terminal-bench@2.0 -t fix-code-vulnerability
69671fb
fix-git
uvx harbor run -d terminal-bench@2.0 -t fix-git
69671fb
fix-ocaml-gc
uvx harbor run -d terminal-bench@2.0 -t fix-ocaml-gc
69671fb
gcode-to-text
uvx harbor run -d terminal-bench@2.0 -t gcode-to-text
69671fb
git-leak-recovery
uvx harbor run -d terminal-bench@2.0 -t git-leak-recovery
69671fb
git-multibranch
uvx harbor run -d terminal-bench@2.0 -t git-multibranch
69671fb
gpt2-codegolf
uvx harbor run -d terminal-bench@2.0 -t gpt2-codegolf
69671fb
headless-terminal
uvx harbor run -d terminal-bench@2.0 -t headless-terminal
69671fb
hf-model-inference
uvx harbor run -d terminal-bench@2.0 -t hf-model-inference
69671fb
install-windows-3.11
uvx harbor run -d terminal-bench@2.0 -t install-windows-3.11
69671fb
kv-store-grpc
uvx harbor run -d terminal-bench@2.0 -t kv-store-grpc
69671fb
large-scale-text-editing
uvx harbor run -d terminal-bench@2.0 -t large-scale-text-editing
69671fb
largest-eigenval
uvx harbor run -d terminal-bench@2.0 -t largest-eigenval
69671fb
llm-inference-batching-scheduler
uvx harbor run -d terminal-bench@2.0 -t llm-inference-batching-scheduler
69671fb
log-summary-date-ranges
uvx harbor run -d terminal-bench@2.0 -t log-summary-date-ranges
69671fb
mailman
uvx harbor run -d terminal-bench@2.0 -t mailman
69671fb
make-doom-for-mips
uvx harbor run -d terminal-bench@2.0 -t make-doom-for-mips
69671fb
make-mips-interpreter
uvx harbor run -d terminal-bench@2.0 -t make-mips-interpreter
69671fb
mcmc-sampling-stan
uvx harbor run -d terminal-bench@2.0 -t mcmc-sampling-stan
69671fb
merge-diff-arc-agi-task
uvx harbor run -d terminal-bench@2.0 -t merge-diff-arc-agi-task
69671fb
model-extraction-relu-logits
uvx harbor run -d terminal-bench@2.0 -t model-extraction-relu-logits
69671fb
modernize-scientific-stack
uvx harbor run -d terminal-bench@2.0 -t modernize-scientific-stack
69671fb
mteb-leaderboard
uvx harbor run -d terminal-bench@2.0 -t mteb-leaderboard
69671fb
mteb-retrieve
uvx harbor run -d terminal-bench@2.0 -t mteb-retrieve
69671fb
multi-source-data-merger
uvx harbor run -d terminal-bench@2.0 -t multi-source-data-merger
69671fb
nginx-request-logging
uvx harbor run -d terminal-bench@2.0 -t nginx-request-logging
69671fb
openssl-selfsigned-cert
uvx harbor run -d terminal-bench@2.0 -t openssl-selfsigned-cert
69671fb
overfull-hbox
uvx harbor run -d terminal-bench@2.0 -t overfull-hbox
69671fb
password-recovery
uvx harbor run -d terminal-bench@2.0 -t password-recovery
69671fb
path-tracing
uvx harbor run -d terminal-bench@2.0 -t path-tracing
69671fb
path-tracing-reverse
uvx harbor run -d terminal-bench@2.0 -t path-tracing-reverse
69671fb
polyglot-c-py
uvx harbor run -d terminal-bench@2.0 -t polyglot-c-py
69671fb
polyglot-rust-c
uvx harbor run -d terminal-bench@2.0 -t polyglot-rust-c
69671fb
portfolio-optimization
uvx harbor run -d terminal-bench@2.0 -t portfolio-optimization
69671fb
protein-assembly
uvx harbor run -d terminal-bench@2.0 -t protein-assembly
69671fb
prove-plus-comm
uvx harbor run -d terminal-bench@2.0 -t prove-plus-comm
69671fb
pypi-server
uvx harbor run -d terminal-bench@2.0 -t pypi-server
69671fb
pytorch-model-cli
uvx harbor run -d terminal-bench@2.0 -t pytorch-model-cli
69671fb
pytorch-model-recovery
uvx harbor run -d terminal-bench@2.0 -t pytorch-model-recovery
69671fb
qemu-alpine-ssh
uvx harbor run -d terminal-bench@2.0 -t qemu-alpine-ssh
69671fb
qemu-startup
uvx harbor run -d terminal-bench@2.0 -t qemu-startup
69671fb
query-optimize
uvx harbor run -d terminal-bench@2.0 -t query-optimize
69671fb
raman-fitting
uvx harbor run -d terminal-bench@2.0 -t raman-fitting
69671fb
regex-chess
uvx harbor run -d terminal-bench@2.0 -t regex-chess
69671fb
regex-log
uvx harbor run -d terminal-bench@2.0 -t regex-log
69671fb
reshard-c4-data
uvx harbor run -d terminal-bench@2.0 -t reshard-c4-data
69671fb
rstan-to-pystan
uvx harbor run -d terminal-bench@2.0 -t rstan-to-pystan
69671fb
sam-cell-seg
uvx harbor run -d terminal-bench@2.0 -t sam-cell-seg
69671fb
sanitize-git-repo
uvx harbor run -d terminal-bench@2.0 -t sanitize-git-repo
69671fb
schemelike-metacircular-eval
uvx harbor run -d terminal-bench@2.0 -t schemelike-metacircular-eval
69671fb
sparql-university
uvx harbor run -d terminal-bench@2.0 -t sparql-university
69671fb
sqlite-db-truncate
uvx harbor run -d terminal-bench@2.0 -t sqlite-db-truncate
69671fb
sqlite-with-gcov
uvx harbor run -d terminal-bench@2.0 -t sqlite-with-gcov
69671fb
torch-pipeline-parallelism
uvx harbor run -d terminal-bench@2.0 -t torch-pipeline-parallelism
69671fb
torch-tensor-parallelism
uvx harbor run -d terminal-bench@2.0 -t torch-tensor-parallelism
69671fb
train-fasttext
uvx harbor run -d terminal-bench@2.0 -t train-fasttext
69671fb
tune-mjcf
uvx harbor run -d terminal-bench@2.0 -t tune-mjcf
69671fb
video-processing
uvx harbor run -d terminal-bench@2.0 -t video-processing
69671fb
vulnerable-secret
uvx harbor run -d terminal-bench@2.0 -t vulnerable-secret
69671fb
winning-avg-corewars
uvx harbor run -d terminal-bench@2.0 -t winning-avg-corewars
69671fb
write-compressor
uvx harbor run -d terminal-bench@2.0 -t write-compressor
69671fb