researchcodebench
v1.0
ResearchCodeBench evaluates AI agents' ability to implement algorithms from academic papers. Contains 212 code implementation tasks across 20 ML/AI research problems from top-tier venues (ICLR, NeurIPS, CVPR, COLM). Tests paper comprehension, algorithm understanding, and precise code implementation skills with 1,449 lines of reference code.
uvx harbor run -d researchcodebench@1.0
Tasks (212)
advantage-alignment_aa_terms1
uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_aa_terms1
69581ca
advantage-alignment_aa_terms2
uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_aa_terms2
69581ca
advantage-alignment_aa_terms3
uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_aa_terms3
69581ca
advantage-alignment_integrated_aa
uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_integrated_aa
69581ca
advantage-alignment_proximal_surrogate
uvx harbor run -d researchcodebench@1.0 -t advantage-alignment_proximal_surrogate
69581ca
diff-transformer_empirical_lambda_init_function
uvx harbor run -d researchcodebench@1.0 -t diff-transformer_empirical_lambda_init_function
69581ca
diff-transformer_groupnorm_with_scaling
uvx harbor run -d researchcodebench@1.0 -t diff-transformer_groupnorm_with_scaling
69581ca
diff-transformer_lambda_setup
uvx harbor run -d researchcodebench@1.0 -t diff-transformer_lambda_setup
69581ca
diff-transformer_multihead_differential_attention
uvx harbor run -d researchcodebench@1.0 -t diff-transformer_multihead_differential_attention
69581ca
diff-transformer_normalization_setup
uvx harbor run -d researchcodebench@1.0 -t diff-transformer_normalization_setup
69581ca
diff-transformer_re-parameterize_lambda
uvx harbor run -d researchcodebench@1.0 -t diff-transformer_re-parameterize_lambda
69581ca
diff-transformer_taking_difference
uvx harbor run -d researchcodebench@1.0 -t diff-transformer_taking_difference
69581ca
diffusiondpo_calculate_model_losses
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_calculate_model_losses
69581ca
diffusiondpo_calculate_reference_losses
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_calculate_reference_losses
69581ca
diffusiondpo_dpo_loss_computation
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_dpo_loss_computation
69581ca
diffusiondpo_dpo_loss_formula
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_dpo_loss_formula
69581ca
diffusiondpo_implicit_accuracy_metric
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_implicit_accuracy_metric
69581ca
diffusiondpo_loss_computation
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_loss_computation
69581ca
diffusiondpo_raw_model_loss_metric
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_raw_model_loss_metric
69581ca
diffusiondpo_raw_ref_loss
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_raw_ref_loss
69581ca
diffusiondpo_sft_loss
uvx harbor run -d researchcodebench@1.0 -t diffusiondpo_sft_loss
69581ca
dyt_alpha
uvx harbor run -d researchcodebench@1.0 -t dyt_alpha
69581ca
dyt_bias
uvx harbor run -d researchcodebench@1.0 -t dyt_bias
69581ca
dyt_channel_split
uvx harbor run -d researchcodebench@1.0 -t dyt_channel_split
69581ca
dyt_channels_first
uvx harbor run -d researchcodebench@1.0 -t dyt_channels_first
69581ca
dyt_channels_last
uvx harbor run -d researchcodebench@1.0 -t dyt_channels_last
69581ca
dyt_forward
uvx harbor run -d researchcodebench@1.0 -t dyt_forward
69581ca
dyt_init
uvx harbor run -d researchcodebench@1.0 -t dyt_init
69581ca
dyt_tanh
uvx harbor run -d researchcodebench@1.0 -t dyt_tanh
69581ca
dyt_weight
uvx harbor run -d researchcodebench@1.0 -t dyt_weight
69581ca
eomt_activation
uvx harbor run -d researchcodebench@1.0 -t eomt_activation
69581ca
eomt_add_query_tokens
uvx harbor run -d researchcodebench@1.0 -t eomt_add_query_tokens
69581ca
eomt_apply_masking_probability
uvx harbor run -d researchcodebench@1.0 -t eomt_apply_masking_probability
69581ca
eomt_calculate_upscale_parameters
uvx harbor run -d researchcodebench@1.0 -t eomt_calculate_upscale_parameters
69581ca
eomt_convolution_1
uvx harbor run -d researchcodebench@1.0 -t eomt_convolution_1
69581ca
eomt_convolution_2
uvx harbor run -d researchcodebench@1.0 -t eomt_convolution_2
69581ca
eomt_create_attention_mask
uvx harbor run -d researchcodebench@1.0 -t eomt_create_attention_mask
69581ca
eomt_custom_attention_mechanism
uvx harbor run -d researchcodebench@1.0 -t eomt_custom_attention_mechanism
69581ca
eomt_disable_attention_mask
uvx harbor run -d researchcodebench@1.0 -t eomt_disable_attention_mask
69581ca
eomt_eomt_initialization
uvx harbor run -d researchcodebench@1.0 -t eomt_eomt_initialization
69581ca
eomt_extract_query_key_value
uvx harbor run -d researchcodebench@1.0 -t eomt_extract_query_key_value
69581ca
eomt_extract_query_tokens
uvx harbor run -d researchcodebench@1.0 -t eomt_extract_query_tokens
69581ca
eomt_final_prediction
uvx harbor run -d researchcodebench@1.0 -t eomt_final_prediction
69581ca
eomt_forward_pass
uvx harbor run -d researchcodebench@1.0 -t eomt_forward_pass
69581ca
eomt_generate_class_logits
uvx harbor run -d researchcodebench@1.0 -t eomt_generate_class_logits
69581ca
eomt_generate_mask_logits
uvx harbor run -d researchcodebench@1.0 -t eomt_generate_mask_logits
69581ca
eomt_generate_masks
uvx harbor run -d researchcodebench@1.0 -t eomt_generate_masks
69581ca
eomt_initialize_class_prediction_head
uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_class_prediction_head
69581ca
eomt_initialize_mask_prediction_head
uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_mask_prediction_head
69581ca
eomt_initialize_query_embedding
uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_query_embedding
69581ca
eomt_initialize_tracking_variables
uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_tracking_variables
69581ca
eomt_initialize_upscale_module
uvx harbor run -d researchcodebench@1.0 -t eomt_initialize_upscale_module
69581ca
eomt_input_normalization
uvx harbor run -d researchcodebench@1.0 -t eomt_input_normalization
69581ca
eomt_normalization
uvx harbor run -d researchcodebench@1.0 -t eomt_normalization
69581ca
eomt_patch_embedding
uvx harbor run -d researchcodebench@1.0 -t eomt_patch_embedding
69581ca
eomt_predict_from_features
uvx harbor run -d researchcodebench@1.0 -t eomt_predict_from_features
69581ca
eomt_predict_masks_and_classes
uvx harbor run -d researchcodebench@1.0 -t eomt_predict_masks_and_classes
69581ca
eomt_process_attention_mask
uvx harbor run -d researchcodebench@1.0 -t eomt_process_attention_mask
69581ca
eomt_process_image_features
uvx harbor run -d researchcodebench@1.0 -t eomt_process_image_features
69581ca
eomt_process_transformer_blocks
uvx harbor run -d researchcodebench@1.0 -t eomt_process_transformer_blocks
69581ca
eomt_project_attention_output
uvx harbor run -d researchcodebench@1.0 -t eomt_project_attention_output
69581ca
eomt_random_mask_disable
uvx harbor run -d researchcodebench@1.0 -t eomt_random_mask_disable
69581ca
eomt_register_attention_mask_probabilities
uvx harbor run -d researchcodebench@1.0 -t eomt_register_attention_mask_probabilities
69581ca
eomt_scale_block_forward
uvx harbor run -d researchcodebench@1.0 -t eomt_scale_block_forward
69581ca
eomt_scale_block_initialization
uvx harbor run -d researchcodebench@1.0 -t eomt_scale_block_initialization
69581ca
eomt_store_parameters
uvx harbor run -d researchcodebench@1.0 -t eomt_store_parameters
69581ca
eomt_transformer_block_forward
uvx harbor run -d researchcodebench@1.0 -t eomt_transformer_block_forward
69581ca
fractalgen_cfg_schedule
uvx harbor run -d researchcodebench@1.0 -t fractalgen_cfg_schedule
69581ca
fractalgen_chunk_mask_to_pred
uvx harbor run -d researchcodebench@1.0 -t fractalgen_chunk_mask_to_pred
69581ca
fractalgen_concatenate_patches
uvx harbor run -d researchcodebench@1.0 -t fractalgen_concatenate_patches
69581ca
fractalgen_init_token_mask__patches__orders__num_iter
uvx harbor run -d researchcodebench@1.0 -t fractalgen_init_token_mask__patches__orders__num_iter
69581ca
fractalgen_mask_by_order
uvx harbor run -d researchcodebench@1.0 -t fractalgen_mask_by_order
69581ca
fractalgen_mask_ratio_and_mask_length
uvx harbor run -d researchcodebench@1.0 -t fractalgen_mask_ratio_and_mask_length
69581ca
fractalgen_next_level_sample_function
uvx harbor run -d researchcodebench@1.0 -t fractalgen_next_level_sample_function
69581ca
fractalgen_patchify
uvx harbor run -d researchcodebench@1.0 -t fractalgen_patchify
69581ca
fractalgen_random_masking
uvx harbor run -d researchcodebench@1.0 -t fractalgen_random_masking
69581ca
fractalgen_random_masking_uniform
uvx harbor run -d researchcodebench@1.0 -t fractalgen_random_masking_uniform
69581ca
fractalgen_sample_image
uvx harbor run -d researchcodebench@1.0 -t fractalgen_sample_image
69581ca
fractalgen_sample_orders
uvx harbor run -d researchcodebench@1.0 -t fractalgen_sample_orders
69581ca
fractalgen_unpatchify
uvx harbor run -d researchcodebench@1.0 -t fractalgen_unpatchify
69581ca
gmflow_apply_2nd_order_gm
uvx harbor run -d researchcodebench@1.0 -t gmflow_apply_2nd_order_gm
69581ca
gmflow_apply_mask_to_gm
uvx harbor run -d researchcodebench@1.0 -t gmflow_apply_mask_to_gm
69581ca
gmflow_apply_probabilistic_cfg
uvx harbor run -d researchcodebench@1.0 -t gmflow_apply_probabilistic_cfg
69581ca
gmflow_compute_final_nll
uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_final_nll
69581ca
gmflow_compute_gaussian_log_likelihood
uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_gaussian_log_likelihood
69581ca
gmflow_compute_gaussian_mask
uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_gaussian_mask
69581ca
gmflow_compute_inverse_stds
uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_inverse_stds
69581ca
gmflow_compute_noise_scale
uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_noise_scale
69581ca
gmflow_compute_timestep_coefficients
uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_timestep_coefficients
69581ca
gmflow_compute_weighted_difference
uvx harbor run -d researchcodebench@1.0 -t gmflow_compute_weighted_difference
69581ca
gmflow_convert_model_output_to_x0
uvx harbor run -d researchcodebench@1.0 -t gmflow_convert_model_output_to_x0
69581ca
gmflow_gm_kl_loss
uvx harbor run -d researchcodebench@1.0 -t gmflow_gm_kl_loss
69581ca
gmflow_main_sampling_loop
uvx harbor run -d researchcodebench@1.0 -t gmflow_main_sampling_loop
69581ca
gmflow_perform_gm_sdeode_step
uvx harbor run -d researchcodebench@1.0 -t gmflow_perform_gm_sdeode_step
69581ca
gmflow_prepare_timestep_tensor
uvx harbor run -d researchcodebench@1.0 -t gmflow_prepare_timestep_tensor
69581ca
gmflow_set_timesteps
uvx harbor run -d researchcodebench@1.0 -t gmflow_set_timesteps
69581ca
gps_chamfer_distance
uvx harbor run -d researchcodebench@1.0 -t gps_chamfer_distance
69581ca
gps_distance_between_two_sets
uvx harbor run -d researchcodebench@1.0 -t gps_distance_between_two_sets
69581ca
gps_euclidean_distance_computation
uvx harbor run -d researchcodebench@1.0 -t gps_euclidean_distance_computation
69581ca
gps_gps_reparameterization_with_minimum_euclidean_distance
uvx harbor run -d researchcodebench@1.0 -t gps_gps_reparameterization_with_minimum_euclidean_distance
69581ca