researchcodebench
v1.0
ResearchCodeBench evaluates AI agents' ability to implement algorithms from academic papers. Contains 212 code implementation tasks across 20 ML/AI research problems from top-tier venues (ICLR, NeurIPS, CVPR, COLM). Tests paper comprehension, algorithm understanding, and precise code implementation skills with 1,449 lines of reference code.
uvx harbor run -d researchcodebench@1.0
Tasks (212)
gps_gumbel_distribution_similarity_transformation
uvx harbor run -d researchcodebench@1.0 -t gps_gumbel_distribution_similarity_transformation
69581ca
gps_pairwise_expansion
uvx harbor run -d researchcodebench@1.0 -t gps_pairwise_expansion
69581ca
grid-cell-conformal-isometry__dx_to_theta_id_dr
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry__dx_to_theta_id_dr
69581ca
grid-cell-conformal-isometry_activity_metrics
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_activity_metrics
69581ca
grid-cell-conformal-isometry_apply_additive_transformation
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_apply_additive_transformation
69581ca
grid-cell-conformal-isometry_apply_nonlinear_transform
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_apply_nonlinear_transform
69581ca
grid-cell-conformal-isometry_apply_transformation
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_apply_transformation
69581ca
grid-cell-conformal-isometry_calculate_distance_preservation
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_calculate_distance_preservation
69581ca
grid-cell-conformal-isometry_compute_neural_representations
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_compute_neural_representations
69581ca
grid-cell-conformal-isometry_compute_transformation_error
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_compute_transformation_error
69581ca
grid-cell-conformal-isometry_compute_transformation_matrix
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_compute_transformation_matrix
69581ca
grid-cell-conformal-isometry_conformal_isometry_loss
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_conformal_isometry_loss
69581ca
grid-cell-conformal-isometry_linear_transformation
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_linear_transformation
69581ca
grid-cell-conformal-isometry_nonlinear_transformation_1
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_nonlinear_transformation_1
69581ca
grid-cell-conformal-isometry_prepare_displacement
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_prepare_displacement
69581ca
grid-cell-conformal-isometry_prepare_inputs
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_prepare_inputs
69581ca
grid-cell-conformal-isometry_transformation_loss
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_transformation_loss
69581ca
hyla_mlp_linear
uvx harbor run -d researchcodebench@1.0 -t hyla_mlp_linear
69581ca
hyla_mlp_relu
uvx harbor run -d researchcodebench@1.0 -t hyla_mlp_relu
69581ca
hyla_rms_head
uvx harbor run -d researchcodebench@1.0 -t hyla_rms_head
69581ca
len_combine_gradients
uvx harbor run -d researchcodebench@1.0 -t len_combine_gradients
69581ca
len_compute_gradients
uvx harbor run -d researchcodebench@1.0 -t len_compute_gradients
69581ca
len_compute_schur_decomposition
uvx harbor run -d researchcodebench@1.0 -t len_compute_schur_decomposition
69581ca
len_construct_jacobian_matrix
uvx harbor run -d researchcodebench@1.0 -t len_construct_jacobian_matrix
69581ca
len_gamma_update_function
uvx harbor run -d researchcodebench@1.0 -t len_gamma_update_function
69581ca
len_gradient_descent_ascent_field
uvx harbor run -d researchcodebench@1.0 -t len_gradient_descent_ascent_field
69581ca
len_initialize_variables
uvx harbor run -d researchcodebench@1.0 -t len_initialize_variables
69581ca
len_jacobian_computation
uvx harbor run -d researchcodebench@1.0 -t len_jacobian_computation
69581ca
len_lazy_extra_newton
uvx harbor run -d researchcodebench@1.0 -t len_lazy_extra_newton
69581ca
len_main_iteration_loop
uvx harbor run -d researchcodebench@1.0 -t len_main_iteration_loop
69581ca
len_split_input_and_compute_norm
uvx harbor run -d researchcodebench@1.0 -t len_split_input_and_compute_norm
69581ca
len_split_input_vector
uvx harbor run -d researchcodebench@1.0 -t len_split_input_vector
69581ca
len_track_metrics
uvx harbor run -d researchcodebench@1.0 -t len_track_metrics
69581ca
len_update_steps
uvx harbor run -d researchcodebench@1.0 -t len_update_steps
69581ca
llm-sci-use_ai_text_probabilities
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_ai_text_probabilities
69581ca
llm-sci-use_alpha_estimation
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_alpha_estimation
69581ca
llm-sci-use_alpha_optimization
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_alpha_optimization
69581ca
llm-sci-use_bootstrap_inference
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_bootstrap_inference
69581ca
llm-sci-use_bootstrap_sampling
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_bootstrap_sampling
69581ca
llm-sci-use_confidence_interval_calculation
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_confidence_interval_calculation
69581ca
llm-sci-use_data_loading
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_data_loading
69581ca
llm-sci-use_data_preprocessing
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_data_preprocessing
69581ca
llm-sci-use_human_text_probabilities
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_human_text_probabilities
69581ca
llm-sci-use_inference_pipeline
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_inference_pipeline
69581ca
llm-sci-use_loading_token_probabilities
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_loading_token_probabilities
69581ca
llm-sci-use_log_likelihood_optimization
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_log_likelihood_optimization
69581ca
llm-sci-use_mixture_log_likelihood_calculation
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_mixture_log_likelihood_calculation
69581ca
llm-sci-use_precompute_probabilities
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_precompute_probabilities
69581ca
llm-sci-use_token_distribution_computation
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_token_distribution_computation
69581ca
minp_apply_filter_to_scores
uvx harbor run -d researchcodebench@1.0 -t minp_apply_filter_to_scores
69581ca
minp_convert_logits_to_probabilities
uvx harbor run -d researchcodebench@1.0 -t minp_convert_logits_to_probabilities
69581ca
minp_ensure_minimum_tokens_are_kept
uvx harbor run -d researchcodebench@1.0 -t minp_ensure_minimum_tokens_are_kept
69581ca
minp_find_maximum_probability_token
uvx harbor run -d researchcodebench@1.0 -t minp_find_maximum_probability_token
69581ca
minp_identify_tokens_to_remove
uvx harbor run -d researchcodebench@1.0 -t minp_identify_tokens_to_remove
69581ca
minp_min-p_sampling
uvx harbor run -d researchcodebench@1.0 -t minp_min-p_sampling
69581ca
minp_scale_min_p_threshold
uvx harbor run -d researchcodebench@1.0 -t minp_scale_min_p_threshold
69581ca
optimalsteps_dynamic_programming_1
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_1
69581ca
optimalsteps_dynamic_programming_2
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_2
69581ca
optimalsteps_dynamic_programming_3
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_3
69581ca
optimalsteps_dynamic_programming_4
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_4
69581ca
optimalsteps_dynamic_programming_for_each_sample
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_for_each_sample
69581ca
optimalsteps_dynamic_programming_initialization_for_each_sample
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_initialization_for_each_sample
69581ca
optimalsteps_dynamic_programming_search
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_search
69581ca
optimalsteps_dynamic_programming_traceback
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_traceback
69581ca
optimalsteps_search_oss_function
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_search_oss_function
69581ca
optimalsteps_teacher_trajectory_computation
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_teacher_trajectory_computation
69581ca
optimalsteps_teacher_trajectory_computation_for_each_timestep
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_teacher_trajectory_computation_for_each_timestep
69581ca
repa-e_check1
uvx harbor run -d researchcodebench@1.0 -t repa-e_check1
69581ca
repa-e_check2
uvx harbor run -d researchcodebench@1.0 -t repa-e_check2
69581ca
repa-e_sit_loss
uvx harbor run -d researchcodebench@1.0 -t repa-e_sit_loss
69581ca
repa-e_sitbatchnorm_eps_is_set_to_1e-4
uvx harbor run -d researchcodebench@1.0 -t repa-e_sitbatchnorm_eps_is_set_to_1e-4
69581ca
repa-e_vae_loss
uvx harbor run -d researchcodebench@1.0 -t repa-e_vae_loss
69581ca
schedule_free_averaging_update
uvx harbor run -d researchcodebench@1.0 -t schedule_free_averaging_update
69581ca
schedule_free_averaging_update_and_interpolation
uvx harbor run -d researchcodebench@1.0 -t schedule_free_averaging_update_and_interpolation
69581ca
schedule_free_bias_correction
uvx harbor run -d researchcodebench@1.0 -t schedule_free_bias_correction
69581ca
schedule_free_decay_the_first_and_second_moment_running_average_coefficient
uvx harbor run -d researchcodebench@1.0 -t schedule_free_decay_the_first_and_second_moment_running_average_coefficient
69581ca
schedule_free_denom
uvx harbor run -d researchcodebench@1.0 -t schedule_free_denom
69581ca
schedule_free_dynamic_weighting
uvx harbor run -d researchcodebench@1.0 -t schedule_free_dynamic_weighting
69581ca
schedule_free_dynamic_weighting_and_averaging
uvx harbor run -d researchcodebench@1.0 -t schedule_free_dynamic_weighting_and_averaging
69581ca
schedule_free_gradient_evaluation_interpolation
uvx harbor run -d researchcodebench@1.0 -t schedule_free_gradient_evaluation_interpolation
69581ca
schedule_free_learning_rate
uvx harbor run -d researchcodebench@1.0 -t schedule_free_learning_rate
69581ca
schedule_free_moment_running_average_coefficient
uvx harbor run -d researchcodebench@1.0 -t schedule_free_moment_running_average_coefficient
69581ca
schedule_free_step
uvx harbor run -d researchcodebench@1.0 -t schedule_free_step
69581ca
schedule_free_warmup_schedule
uvx harbor run -d researchcodebench@1.0 -t schedule_free_warmup_schedule
69581ca
schedule_free_weight_decay
uvx harbor run -d researchcodebench@1.0 -t schedule_free_weight_decay
69581ca
schedule_free_z_update
uvx harbor run -d researchcodebench@1.0 -t schedule_free_z_update
69581ca
semanticist_apply_embedding_masks
uvx harbor run -d researchcodebench@1.0 -t semanticist_apply_embedding_masks
69581ca
semanticist_decode_multiple_tokens
uvx harbor run -d researchcodebench@1.0 -t semanticist_decode_multiple_tokens
69581ca
semanticist_decode_single_token
uvx harbor run -d researchcodebench@1.0 -t semanticist_decode_single_token
69581ca
semanticist_generate_main_function
uvx harbor run -d researchcodebench@1.0 -t semanticist_generate_main_function
69581ca
semanticist_generate_remaining_tokens
uvx harbor run -d researchcodebench@1.0 -t semanticist_generate_remaining_tokens
69581ca
semanticist_get_cfg_scale
uvx harbor run -d researchcodebench@1.0 -t semanticist_get_cfg_scale
69581ca
semanticist_initialize_sequence
uvx harbor run -d researchcodebench@1.0 -t semanticist_initialize_sequence
69581ca
semanticist_prefill_initial_tokens
uvx harbor run -d researchcodebench@1.0 -t semanticist_prefill_initial_tokens
69581ca
semanticist_prefill_sequence
uvx harbor run -d researchcodebench@1.0 -t semanticist_prefill_sequence
69581ca
semanticist_setup_conditions
uvx harbor run -d researchcodebench@1.0 -t semanticist_setup_conditions
69581ca
semanticist_setup_sequence_parameters
uvx harbor run -d researchcodebench@1.0 -t semanticist_setup_sequence_parameters
69581ca
siss_compute_loss_terms
uvx harbor run -d researchcodebench@1.0 -t siss_compute_loss_terms
69581ca
siss_create_defensive_mixture
uvx harbor run -d researchcodebench@1.0 -t siss_create_defensive_mixture
69581ca
siss_double_forward_passes
uvx harbor run -d researchcodebench@1.0 -t siss_double_forward_passes
69581ca