ml-dev-bench
v1.0
ML-Dev-Bench: A benchmark for testing AI agents on machine learning development tasks, including model implementation, training, debugging, and optimization.
uvx harbor run -d ml-dev-bench@1.0
Tasks (33)
ml_dev_bench_basic_vision_finetuning
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_basic_vision_finetuning
044856a
ml_dev_bench_bert_eval_debug
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_bert_eval_debug
044856a
ml_dev_bench_boolq_performance
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_boolq_performance
044856a
ml_dev_bench_channel_vit_implementation
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_channel_vit_implementation
044856a
ml_dev_bench_channel_vit_implementation_easy
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_channel_vit_implementation_easy
044856a
ml_dev_bench_channel_vit_implementation_no_test
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_channel_vit_implementation_no_test
044856a
ml_dev_bench_cifar_10_lt_performance
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_cifar_10_lt_performance
044856a
ml_dev_bench_cifar100_performance
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_cifar100_performance
044856a
ml_dev_bench_dataset_not_available_download
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_dataset_not_available_download
044856a
ml_dev_bench_dataset_preprocess
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_dataset_preprocess
044856a
ml_dev_bench_full_train_workflow_performance_test
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_full_train_workflow_performance_test
044856a
ml_dev_bench_full_train_workflow_setup_test
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_full_train_workflow_setup_test
044856a
ml_dev_bench_improve_cifar10_baseline
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_improve_cifar10_baseline
044856a
ml_dev_bench_improve_segmentation_baseline
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_improve_segmentation_baseline
044856a
ml_dev_bench_lora_implementation
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_lora_implementation
044856a
ml_dev_bench_mcts_implementation
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_mcts_implementation
044856a
ml_dev_bench_mla_implementation
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_mla_implementation
044856a
ml_dev_bench_mla_implementation_hidden_tests
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_mla_implementation_hidden_tests
044856a
ml_dev_bench_nan_loss_debug
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_nan_loss_debug
044856a
ml_dev_bench_noisy_dataset_download
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_noisy_dataset_download
044856a
ml_dev_bench_noisy_label_annotation
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_noisy_label_annotation
044856a
ml_dev_bench_normalization_bug
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_normalization_bug
044856a
ml_dev_bench_parse_logs
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_parse_logs
044856a
ml_dev_bench_ppo_implementation
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_ppo_implementation
044856a
ml_dev_bench_pretrained_bert_base_uncased_load
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_pretrained_bert_base_uncased_load
044856a
ml_dev_bench_pretrained_model_load_from_torchvision
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_pretrained_model_load_from_torchvision
044856a
ml_dev_bench_shape_mismatch_output
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_shape_mismatch_output
044856a
ml_dev_bench_shape_mismatch_train
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_shape_mismatch_train
044856a
ml_dev_bench_small_dataset_overfit
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_small_dataset_overfit
044856a
ml_dev_bench_training_files_debug
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_training_files_debug
044856a
ml_dev_bench_var_implementation
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_var_implementation
044856a
ml_dev_bench_vit_debugging
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_vit_debugging
044856a
ml_dev_bench_wandb_logging
uvx harbor run -d ml-dev-bench@1.0 -t ml_dev_bench_wandb_logging
044856a