researchcodebench

v1.0

ResearchCodeBench evaluates AI agents' ability to implement algorithms from academic papers. It contains 212 code implementation tasks across 20 ML/AI research problems from top-tier venues (ICLR, NeurIPS, CVPR, COLM). It tests paper comprehension, algorithm understanding, and precise code implementation skills, and includes 1,449 lines of reference code.

uvx harbor run -d researchcodebench@1.0

Tasks (212)

gps_gumbel_distribution_similarity_transformation
uvx harbor run -d researchcodebench@1.0 -t gps_gumbel_distribution_similarity_transformation
69581ca
gps_pairwise_expansion
uvx harbor run -d researchcodebench@1.0 -t gps_pairwise_expansion
69581ca
grid-cell-conformal-isometry__dx_to_theta_id_dr
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry__dx_to_theta_id_dr
69581ca
grid-cell-conformal-isometry_activity_metrics
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_activity_metrics
69581ca
grid-cell-conformal-isometry_apply_additive_transformation
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_apply_additive_transformation
69581ca
grid-cell-conformal-isometry_apply_nonlinear_transform
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_apply_nonlinear_transform
69581ca
grid-cell-conformal-isometry_apply_transformation
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_apply_transformation
69581ca
grid-cell-conformal-isometry_calculate_distance_preservation
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_calculate_distance_preservation
69581ca
grid-cell-conformal-isometry_compute_neural_representations
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_compute_neural_representations
69581ca
grid-cell-conformal-isometry_compute_transformation_error
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_compute_transformation_error
69581ca
grid-cell-conformal-isometry_compute_transformation_matrix
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_compute_transformation_matrix
69581ca
grid-cell-conformal-isometry_conformal_isometry_loss
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_conformal_isometry_loss
69581ca
grid-cell-conformal-isometry_linear_transformation
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_linear_transformation
69581ca
grid-cell-conformal-isometry_nonlinear_transformation_1
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_nonlinear_transformation_1
69581ca
grid-cell-conformal-isometry_prepare_displacement
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_prepare_displacement
69581ca
grid-cell-conformal-isometry_prepare_inputs
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_prepare_inputs
69581ca
grid-cell-conformal-isometry_transformation_loss
uvx harbor run -d researchcodebench@1.0 -t grid-cell-conformal-isometry_transformation_loss
69581ca
hyla_mlp_linear
uvx harbor run -d researchcodebench@1.0 -t hyla_mlp_linear
69581ca
hyla_mlp_relu
uvx harbor run -d researchcodebench@1.0 -t hyla_mlp_relu
69581ca
hyla_rms_head
uvx harbor run -d researchcodebench@1.0 -t hyla_rms_head
69581ca
len_combine_gradients
uvx harbor run -d researchcodebench@1.0 -t len_combine_gradients
69581ca
len_compute_gradients
uvx harbor run -d researchcodebench@1.0 -t len_compute_gradients
69581ca
len_compute_schur_decomposition
uvx harbor run -d researchcodebench@1.0 -t len_compute_schur_decomposition
69581ca
len_construct_jacobian_matrix
uvx harbor run -d researchcodebench@1.0 -t len_construct_jacobian_matrix
69581ca
len_gamma_update_function
uvx harbor run -d researchcodebench@1.0 -t len_gamma_update_function
69581ca
len_gradient_descent_ascent_field
uvx harbor run -d researchcodebench@1.0 -t len_gradient_descent_ascent_field
69581ca
len_initialize_variables
uvx harbor run -d researchcodebench@1.0 -t len_initialize_variables
69581ca
len_jacobian_computation
uvx harbor run -d researchcodebench@1.0 -t len_jacobian_computation
69581ca
len_lazy_extra_newton
uvx harbor run -d researchcodebench@1.0 -t len_lazy_extra_newton
69581ca
len_main_iteration_loop
uvx harbor run -d researchcodebench@1.0 -t len_main_iteration_loop
69581ca
len_split_input_and_compute_norm
uvx harbor run -d researchcodebench@1.0 -t len_split_input_and_compute_norm
69581ca
len_split_input_vector
uvx harbor run -d researchcodebench@1.0 -t len_split_input_vector
69581ca
len_track_metrics
uvx harbor run -d researchcodebench@1.0 -t len_track_metrics
69581ca
len_update_steps
uvx harbor run -d researchcodebench@1.0 -t len_update_steps
69581ca
llm-sci-use_ai_text_probabilities
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_ai_text_probabilities
69581ca
llm-sci-use_alpha_estimation
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_alpha_estimation
69581ca
llm-sci-use_alpha_optimization
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_alpha_optimization
69581ca
llm-sci-use_bootstrap_inference
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_bootstrap_inference
69581ca
llm-sci-use_bootstrap_sampling
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_bootstrap_sampling
69581ca
llm-sci-use_confidence_interval_calculation
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_confidence_interval_calculation
69581ca
llm-sci-use_data_loading
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_data_loading
69581ca
llm-sci-use_data_preprocessing
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_data_preprocessing
69581ca
llm-sci-use_human_text_probabilities
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_human_text_probabilities
69581ca
llm-sci-use_inference_pipeline
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_inference_pipeline
69581ca
llm-sci-use_loading_token_probabilities
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_loading_token_probabilities
69581ca
llm-sci-use_log_likelihood_optimization
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_log_likelihood_optimization
69581ca
llm-sci-use_mixture_log_likelihood_calculation
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_mixture_log_likelihood_calculation
69581ca
llm-sci-use_precompute_probabilities
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_precompute_probabilities
69581ca
llm-sci-use_token_distribution_computation
uvx harbor run -d researchcodebench@1.0 -t llm-sci-use_token_distribution_computation
69581ca
minp_apply_filter_to_scores
uvx harbor run -d researchcodebench@1.0 -t minp_apply_filter_to_scores
69581ca
minp_convert_logits_to_probabilities
uvx harbor run -d researchcodebench@1.0 -t minp_convert_logits_to_probabilities
69581ca
minp_ensure_minimum_tokens_are_kept
uvx harbor run -d researchcodebench@1.0 -t minp_ensure_minimum_tokens_are_kept
69581ca
minp_find_maximum_probability_token
uvx harbor run -d researchcodebench@1.0 -t minp_find_maximum_probability_token
69581ca
minp_identify_tokens_to_remove
uvx harbor run -d researchcodebench@1.0 -t minp_identify_tokens_to_remove
69581ca
minp_min-p_sampling
uvx harbor run -d researchcodebench@1.0 -t minp_min-p_sampling
69581ca
minp_scale_min_p_threshold
uvx harbor run -d researchcodebench@1.0 -t minp_scale_min_p_threshold
69581ca
optimalsteps_dynamic_programming_1
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_1
69581ca
optimalsteps_dynamic_programming_2
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_2
69581ca
optimalsteps_dynamic_programming_3
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_3
69581ca
optimalsteps_dynamic_programming_4
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_4
69581ca
optimalsteps_dynamic_programming_for_each_sample
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_for_each_sample
69581ca
optimalsteps_dynamic_programming_initialization_for_each_sample
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_initialization_for_each_sample
69581ca
optimalsteps_dynamic_programming_search
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_search
69581ca
optimalsteps_dynamic_programming_traceback
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_dynamic_programming_traceback
69581ca
optimalsteps_search_oss_function
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_search_oss_function
69581ca
optimalsteps_teacher_trajectory_computation
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_teacher_trajectory_computation
69581ca
optimalsteps_teacher_trajectory_computation_for_each_timestep
uvx harbor run -d researchcodebench@1.0 -t optimalsteps_teacher_trajectory_computation_for_each_timestep
69581ca
repa-e_check1
uvx harbor run -d researchcodebench@1.0 -t repa-e_check1
69581ca
repa-e_check2
uvx harbor run -d researchcodebench@1.0 -t repa-e_check2
69581ca
repa-e_sit_loss
uvx harbor run -d researchcodebench@1.0 -t repa-e_sit_loss
69581ca
repa-e_sitbatchnorm_eps_is_set_to_1e-4
uvx harbor run -d researchcodebench@1.0 -t repa-e_sitbatchnorm_eps_is_set_to_1e-4
69581ca
repa-e_vae_loss
uvx harbor run -d researchcodebench@1.0 -t repa-e_vae_loss
69581ca
schedule_free_averaging_update
uvx harbor run -d researchcodebench@1.0 -t schedule_free_averaging_update
69581ca
schedule_free_averaging_update_and_interpolation
uvx harbor run -d researchcodebench@1.0 -t schedule_free_averaging_update_and_interpolation
69581ca
schedule_free_bias_correction
uvx harbor run -d researchcodebench@1.0 -t schedule_free_bias_correction
69581ca
schedule_free_decay_the_first_and_second_moment_running_average_coefficient
uvx harbor run -d researchcodebench@1.0 -t schedule_free_decay_the_first_and_second_moment_running_average_coefficient
69581ca
schedule_free_denom
uvx harbor run -d researchcodebench@1.0 -t schedule_free_denom
69581ca
schedule_free_dynamic_weighting
uvx harbor run -d researchcodebench@1.0 -t schedule_free_dynamic_weighting
69581ca
schedule_free_dynamic_weighting_and_averaging
uvx harbor run -d researchcodebench@1.0 -t schedule_free_dynamic_weighting_and_averaging
69581ca
schedule_free_gradient_evaluation_interpolation
uvx harbor run -d researchcodebench@1.0 -t schedule_free_gradient_evaluation_interpolation
69581ca
schedule_free_learning_rate
uvx harbor run -d researchcodebench@1.0 -t schedule_free_learning_rate
69581ca
schedule_free_moment_running_average_coefficient
uvx harbor run -d researchcodebench@1.0 -t schedule_free_moment_running_average_coefficient
69581ca
schedule_free_step
uvx harbor run -d researchcodebench@1.0 -t schedule_free_step
69581ca
schedule_free_warmup_schedule
uvx harbor run -d researchcodebench@1.0 -t schedule_free_warmup_schedule
69581ca
schedule_free_weight_decay
uvx harbor run -d researchcodebench@1.0 -t schedule_free_weight_decay
69581ca
schedule_free_z_update
uvx harbor run -d researchcodebench@1.0 -t schedule_free_z_update
69581ca
semanticist_apply_embedding_masks
uvx harbor run -d researchcodebench@1.0 -t semanticist_apply_embedding_masks
69581ca
semanticist_decode_multiple_tokens
uvx harbor run -d researchcodebench@1.0 -t semanticist_decode_multiple_tokens
69581ca
semanticist_decode_single_token
uvx harbor run -d researchcodebench@1.0 -t semanticist_decode_single_token
69581ca
semanticist_generate_main_function
uvx harbor run -d researchcodebench@1.0 -t semanticist_generate_main_function
69581ca
semanticist_generate_remaining_tokens
uvx harbor run -d researchcodebench@1.0 -t semanticist_generate_remaining_tokens
69581ca
semanticist_get_cfg_scale
uvx harbor run -d researchcodebench@1.0 -t semanticist_get_cfg_scale
69581ca
semanticist_initialize_sequence
uvx harbor run -d researchcodebench@1.0 -t semanticist_initialize_sequence
69581ca
semanticist_prefill_initial_tokens
uvx harbor run -d researchcodebench@1.0 -t semanticist_prefill_initial_tokens
69581ca
semanticist_prefill_sequence
uvx harbor run -d researchcodebench@1.0 -t semanticist_prefill_sequence
69581ca
semanticist_setup_conditions
uvx harbor run -d researchcodebench@1.0 -t semanticist_setup_conditions
69581ca
semanticist_setup_sequence_parameters
uvx harbor run -d researchcodebench@1.0 -t semanticist_setup_sequence_parameters
69581ca
siss_compute_loss_terms
uvx harbor run -d researchcodebench@1.0 -t siss_compute_loss_terms
69581ca
siss_create_defensive_mixture
uvx harbor run -d researchcodebench@1.0 -t siss_create_defensive_mixture
69581ca
siss_double_forward_passes
uvx harbor run -d researchcodebench@1.0 -t siss_double_forward_passes
69581ca