researchcodebench

v1.0

ResearchCodeBench evaluates AI agents' ability to implement algorithms from academic papers. It contains 212 code implementation tasks across 20 ML/AI research problems from top-tier venues (ICLR, NeurIPS, CVPR, COLM). The benchmark tests paper comprehension, algorithm understanding, and precise code implementation skills, and includes 1,449 lines of reference code.

uvx harbor run -d researchcodebench@1.0

Tasks (212)

advantage-alignment_aa_terms1

uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_aa_terms1

advantage-alignment_aa_terms2

uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_aa_terms2

advantage-alignment_aa_terms3

uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_aa_terms3

advantage-alignment_integrated_aa

uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_integrated_aa

advantage-alignment_proximal_surrogate

uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_proximal_surrogate

diff-transformer_empirical_lambda_init_function

uvx harbor run -d researchcodebench@1.0 -t diff-transformer_empirical_lambda_init_function

diff-transformer_groupnorm_with_scaling

uvx harbor run -d researchcodebench@1.0 -t diff-transformer_groupnorm_with_scaling

diff-transformer_lambda_setup

uvx harbor run -d researchcodebench@1.0 -t diff-transformer_lambda_setup

diff-transformer_multihead_differential_attention

uvx harbor run -d researchcodebench@1.0 -t diff-transformer_multihead_differential_attention

diff-transformer_normalization_setup

uvx harbor run -d researchcodebench@1.0 -t diff-transformer_normalization_setup

diff-transformer_re-parameterize_lambda

uvx harbor run -d researchcodebench@1.0 -t diff-transformer_re-parameterize_lambda

diff-transformer_taking_difference

uvx harbor run -d researchcodebench@1.0 -t diff-transformer_taking_difference

diffusiondpo_calculate_model_losses

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_calculate_model_losses

diffusiondpo_calculate_reference_losses

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_calculate_reference_losses

diffusiondpo_dpo_loss_computation

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_dpo_loss_computation

diffusiondpo_dpo_loss_formula

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_dpo_loss_formula

diffusiondpo_implicit_accuracy_metric

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_implicit_accuracy_metric

diffusiondpo_loss_computation

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_loss_computation

diffusiondpo_raw_model_loss_metric

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_raw_model_loss_metric

diffusiondpo_raw_ref_loss

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_raw_ref_loss

diffusiondpo_sft_loss

uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_sft_loss

dyt_alpha

uvx harbor run -d researchcodebench@1.0 -t dyt_alpha

dyt_bias

uvx harbor run -d researchcodebench@1.0 -t dyt_bias

dyt_channel_split

uvx harbor run -d researchcodebench@1.0 -t dyt_channel_split

dyt_channels_first

uvx harbor run -d researchcodebench@1.0 -t dyt_channels_first

dyt_channels_last

uvx harbor run -d researchcodebench@1.0 -t dyt_channels_last

dyt_forward

uvx harbor run -d researchcodebench@1.0 -t dyt_forward

dyt_init

uvx harbor run -d researchcodebench@1.0 -t dyt_init

dyt_tanh

uvx harbor run -d researchcodebench@1.0 -t dyt_tanh

dyt_weight

uvx harbor run -d researchcodebench@1.0 -t dyt_weight

eomt_activation

uvx harbor run -d researchcodebench@1.0 -t eomt_activation

eomt_add_query_tokens

uvx harbor run -d researchcodebench@1.0 -t eomt_add_query_tokens

eomt_apply_masking_probability

uvx harbor run -d researchcodebench@1.0 -t eomt_apply_masking_probability

eomt_calculate_upscale_parameters

uvx harbor run -d researchcodebench@1.0 -t eomt_calculate_upscale_parameters

eomt_convolution_1

uvx harbor run -d researchcodebench@1.0 -t eomt_convolution_1

eomt_convolution_2

uvx harbor run -d researchcodebench@1.0 -t eomt_convolution_2

eomt_create_attention_mask

uvx harbor run -d researchcodebench@1.0 -t eomt_create_attention_mask

eomt_custom_attention_mechanism

uvx harbor run -d researchcodebench@1.0 -t eomt_custom_attention_mechanism

eomt_disable_attention_mask

uvx harbor run -d researchcodebench@1.0 -t eomt_disable_attention_mask

eomt_eomt_initialization

uvx harbor run -d researchcodebench@1.0 -t eomt_eomt_initialization

eomt_extract_query_key_value

uvx harbor run -d researchcodebench@1.0 -t eomt_extract_query_key_value

eomt_extract_query_tokens

uvx harbor run -d researchcodebench@1.0 -t eomt_extract_query_tokens

eomt_final_prediction

uvx harbor run -d researchcodebench@1.0 -t eomt_final_prediction

eomt_forward_pass

uvx harbor run -d researchcodebench@1.0 -t eomt_forward_pass

eomt_generate_class_logits

uvx harbor run -d researchcodebench@1.0 -t eomt_generate_class_logits

eomt_generate_mask_logits

uvx harbor run -d researchcodebench@1.0 -t eomt_generate_mask_logits

eomt_generate_masks

uvx harbor run -d researchcodebench@1.0 -t eomt_generate_masks

eomt_initialize_class_prediction_head

uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_class_prediction_head

eomt_initialize_mask_prediction_head

uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_mask_prediction_head

eomt_initialize_query_embedding

uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_query_embedding

eomt_initialize_tracking_variables

uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_tracking_variables

eomt_initialize_upscale_module

uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_upscale_module

eomt_input_normalization

uvx harbor run -d researchcodebench@1.0 -t eomt_input_normalization

eomt_normalization

uvx harbor run -d researchcodebench@1.0 -t eomt_normalization

eomt_patch_embedding

uvx harbor run -d researchcodebench@1.0 -t eomt_patch_embedding

eomt_predict_from_features

uvx harbor run -d researchcodebench@1.0 -t eomt_predict_from_features

eomt_predict_masks_and_classes

uvx harbor run -d researchcodebench@1.0 -t eomt_predict_masks_and_classes

eomt_process_attention_mask

uvx harbor run -d researchcodebench@1.0 -t eomt_process_attention_mask

eomt_process_image_features

uvx harbor run -d researchcodebench@1.0 -t eomt_process_image_features

eomt_process_transformer_blocks

uvx harbor run -d researchcodebench@1.0 -t eomt_process_transformer_blocks

eomt_project_attention_output

uvx harbor run -d researchcodebench@1.0 -t eomt_project_attention_output

eomt_random_mask_disable

uvx harbor run -d researchcodebench@1.0 -t eomt_random_mask_disable

eomt_register_attention_mask_probabilities

uvx harbor run -d researchcodebench@1.0 -t eomt_register_attention_mask_probabilities

eomt_scale_block_forward

uvx harbor run -d researchcodebench@1.0 -t eomt_scale_block_forward

eomt_scale_block_initialization

uvx harbor run -d researchcodebench@1.0 -t eomt_scale_block_initialization

eomt_store_parameters

uvx harbor run -d researchcodebench@1.0 -t eomt_store_parameters

eomt_transformer_block_forward

uvx harbor run -d researchcodebench@1.0 -t eomt_transformer_block_forward

fractalgen_cfg_schedule

uvx harbor run -d researchcodebench@1.0 -t fractalgen_cfg_schedule

fractalgen_chunk_mask_to_pred

uvx harbor run -d researchcodebench@1.0 -t fractalgen_chunk_mask_to_pred

fractalgen_concatenate_patches

uvx harbor run -d researchcodebench@1.0 -t fractalgen_concatenate_patches

fractalgen_init_token_mask__patches__orders__num_iter

uvx harbor run -d researchcodebench@1.0 -t fractalgen_init_token_mask__patches__orders__num_iter

fractalgen_mask_by_order

uvx harbor run -d researchcodebench@1.0 -t fractalgen_mask_by_order

fractalgen_mask_ratio_and_mask_length

uvx harbor run -d researchcodebench@1.0 -t fractalgen_mask_ratio_and_mask_length

fractalgen_next_level_sample_function

uvx harbor run -d researchcodebench@1.0 -t fractalgen_next_level_sample_function

fractalgen_patchify

uvx harbor run -d researchcodebench@1.0 -t fractalgen_patchify

fractalgen_random_masking

uvx harbor run -d researchcodebench@1.0 -t fractalgen_random_masking

fractalgen_random_masking_uniform

uvx harbor run -d researchcodebench@1.0 -t fractalgen_random_masking_uniform

fractalgen_sample_image

uvx harbor run -d researchcodebench@1.0 -t fractalgen_sample_image

fractalgen_sample_orders

uvx harbor run -d researchcodebench@1.0 -t fractalgen_sample_orders

fractalgen_unpatchify

uvx harbor run -d researchcodebench@1.0 -t fractalgen_unpatchify

gmflow_apply_2nd_order_gm

uvx harbor run -d researchcodebench@1.0 -t gmflow_apply_2nd_order_gm

gmflow_apply_mask_to_gm

uvx harbor run -d researchcodebench@1.0 -t gmflow_apply_mask_to_gm

gmflow_apply_probabilistic_cfg

uvx harbor run -d researchcodebench@1.0 -t gmflow_apply_probabilistic_cfg

gmflow_compute_final_nll

uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_final_nll

gmflow_compute_gaussian_log_likelihood

uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_gaussian_log_likelihood

gmflow_compute_gaussian_mask

uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_gaussian_mask

gmflow_compute_inverse_stds

uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_inverse_stds

gmflow_compute_noise_scale

uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_noise_scale

gmflow_compute_timestep_coefficients

uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_timestep_coefficients

gmflow_compute_weighted_difference

uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_weighted_difference

gmflow_convert_model_output_to_x0

uvx harbor run -d researchcodebench@1.0 -t gmflow_convert_model_output_to_x0

gmflow_gm_kl_loss

uvx harbor run -d researchcodebench@1.0 -t gmflow_gm_kl_loss

gmflow_main_sampling_loop

uvx harbor run -d researchcodebench@1.0 -t gmflow_main_sampling_loop

gmflow_perform_gm_sdeode_step

uvx harbor run -d researchcodebench@1.0 -t gmflow_perform_gm_sdeode_step

gmflow_prepare_timestep_tensor

uvx harbor run -d researchcodebench@1.0 -t gmflow_prepare_timestep_tensor

gmflow_set_timesteps

uvx harbor run -d researchcodebench@1.0 -t gmflow_set_timesteps

gps_chamfer_distance

uvx harbor run -d researchcodebench@1.0 -t gps_chamfer_distance

gps_distance_between_two_sets

uvx harbor run -d researchcodebench@1.0 -t gps_distance_between_two_sets

gps_euclidean_distance_computation

uvx harbor run -d researchcodebench@1.0 -t gps_euclidean_distance_computation

gps_gps_reparameterization_with_minimum_euclidean_distance

uvx harbor run -d researchcodebench@1.0 -t gps_gps_reparameterization_with_minimum_euclidean_distance