spreadsheetbench-verified

v1.0

A benchmark evaluating AI agents on real-world spreadsheet manipulation tasks (400 tasks from verified_400). Tasks involve Excel file manipulation, including formula writing, data transformation, formatting, and conditional logic.

uvx harbor run -d spreadsheetbench-verified@1.0

Tasks (400)

353-29
uvx harbor run -d spreadsheetbench-verified@1.0 -t 353-29
0ad7209
353-6
uvx harbor run -d spreadsheetbench-verified@1.0 -t 353-6
0ad7209
35739
uvx harbor run -d spreadsheetbench-verified@1.0 -t 35739
0ad7209
35742
uvx harbor run -d spreadsheetbench-verified@1.0 -t 35742
0ad7209
35747
uvx harbor run -d spreadsheetbench-verified@1.0 -t 35747
0ad7209
359-21
uvx harbor run -d spreadsheetbench-verified@1.0 -t 359-21
0ad7209
36097
uvx harbor run -d spreadsheetbench-verified@1.0 -t 36097
0ad7209
36191
uvx harbor run -d spreadsheetbench-verified@1.0 -t 36191
0ad7209
36277
uvx harbor run -d spreadsheetbench-verified@1.0 -t 36277
0ad7209
367-23
uvx harbor run -d spreadsheetbench-verified@1.0 -t 367-23
0ad7209
36764
uvx harbor run -d spreadsheetbench-verified@1.0 -t 36764
0ad7209
370-43
uvx harbor run -d spreadsheetbench-verified@1.0 -t 370-43
0ad7209
37086
uvx harbor run -d spreadsheetbench-verified@1.0 -t 37086
0ad7209
37229
uvx harbor run -d spreadsheetbench-verified@1.0 -t 37229
0ad7209
37378
uvx harbor run -d spreadsheetbench-verified@1.0 -t 37378
0ad7209
374-18
uvx harbor run -d spreadsheetbench-verified@1.0 -t 374-18
0ad7209
374-31
uvx harbor run -d spreadsheetbench-verified@1.0 -t 374-31
0ad7209
37456
uvx harbor run -d spreadsheetbench-verified@1.0 -t 37456
0ad7209
37462
uvx harbor run -d spreadsheetbench-verified@1.0 -t 37462
0ad7209
37554
uvx harbor run -d spreadsheetbench-verified@1.0 -t 37554
0ad7209
379-36
uvx harbor run -d spreadsheetbench-verified@1.0 -t 379-36
0ad7209
37900
uvx harbor run -d spreadsheetbench-verified@1.0 -t 37900
0ad7209
38074
uvx harbor run -d spreadsheetbench-verified@1.0 -t 38074
0ad7209
382-10
uvx harbor run -d spreadsheetbench-verified@1.0 -t 382-10
0ad7209
382-29
uvx harbor run -d spreadsheetbench-verified@1.0 -t 382-29
0ad7209
384-4
uvx harbor run -d spreadsheetbench-verified@1.0 -t 384-4
0ad7209
38462
uvx harbor run -d spreadsheetbench-verified@1.0 -t 38462
0ad7209
38537
uvx harbor run -d spreadsheetbench-verified@1.0 -t 38537
0ad7209
387-16
uvx harbor run -d spreadsheetbench-verified@1.0 -t 387-16
0ad7209
38703
uvx harbor run -d spreadsheetbench-verified@1.0 -t 38703
0ad7209
388-47
uvx harbor run -d spreadsheetbench-verified@1.0 -t 388-47
0ad7209
38823
uvx harbor run -d spreadsheetbench-verified@1.0 -t 38823
0ad7209
38969
uvx harbor run -d spreadsheetbench-verified@1.0 -t 38969
0ad7209
38985
uvx harbor run -d spreadsheetbench-verified@1.0 -t 38985
0ad7209
39046
uvx harbor run -d spreadsheetbench-verified@1.0 -t 39046
0ad7209
3911
uvx harbor run -d spreadsheetbench-verified@1.0 -t 3911
0ad7209
39190
uvx harbor run -d spreadsheetbench-verified@1.0 -t 39190
0ad7209
39432
uvx harbor run -d spreadsheetbench-verified@1.0 -t 39432
0ad7209
395-36
uvx harbor run -d spreadsheetbench-verified@1.0 -t 395-36
0ad7209
39515
uvx harbor run -d spreadsheetbench-verified@1.0 -t 39515
0ad7209
39667
uvx harbor run -d spreadsheetbench-verified@1.0 -t 39667
0ad7209
398-14
uvx harbor run -d spreadsheetbench-verified@1.0 -t 398-14
0ad7209
399-14
uvx harbor run -d spreadsheetbench-verified@1.0 -t 399-14
0ad7209
39903
uvx harbor run -d spreadsheetbench-verified@1.0 -t 39903
0ad7209
39931
uvx harbor run -d spreadsheetbench-verified@1.0 -t 39931
0ad7209
402-43
uvx harbor run -d spreadsheetbench-verified@1.0 -t 402-43
0ad7209
40478
uvx harbor run -d spreadsheetbench-verified@1.0 -t 40478
0ad7209
40757
uvx harbor run -d spreadsheetbench-verified@1.0 -t 40757
0ad7209
408-39
uvx harbor run -d spreadsheetbench-verified@1.0 -t 408-39
0ad7209
408-5
uvx harbor run -d spreadsheetbench-verified@1.0 -t 408-5
0ad7209
40892
uvx harbor run -d spreadsheetbench-verified@1.0 -t 40892
0ad7209
409-45
uvx harbor run -d spreadsheetbench-verified@1.0 -t 409-45
0ad7209
40959
uvx harbor run -d spreadsheetbench-verified@1.0 -t 40959
0ad7209
41-47
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41-47
0ad7209
41265
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41265
0ad7209
41348
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41348
0ad7209
414-20
uvx harbor run -d spreadsheetbench-verified@1.0 -t 414-20
0ad7209
41410
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41410
0ad7209
41420
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41420
0ad7209
41589
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41589
0ad7209
416-15
uvx harbor run -d spreadsheetbench-verified@1.0 -t 416-15
0ad7209
416-27
uvx harbor run -d spreadsheetbench-verified@1.0 -t 416-27
0ad7209
41601
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41601
0ad7209
41691
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41691
0ad7209
41969
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41969
0ad7209
41978
uvx harbor run -d spreadsheetbench-verified@1.0 -t 41978
0ad7209
42181
uvx harbor run -d spreadsheetbench-verified@1.0 -t 42181
0ad7209
42198
uvx harbor run -d spreadsheetbench-verified@1.0 -t 42198
0ad7209
42216
uvx harbor run -d spreadsheetbench-verified@1.0 -t 42216
0ad7209
423-16
uvx harbor run -d spreadsheetbench-verified@1.0 -t 423-16
0ad7209
42354
uvx harbor run -d spreadsheetbench-verified@1.0 -t 42354
0ad7209
42515
uvx harbor run -d spreadsheetbench-verified@1.0 -t 42515
0ad7209
42526
uvx harbor run -d spreadsheetbench-verified@1.0 -t 42526
0ad7209
42902
uvx harbor run -d spreadsheetbench-verified@1.0 -t 42902
0ad7209
42930
uvx harbor run -d spreadsheetbench-verified@1.0 -t 42930
0ad7209
43213
uvx harbor run -d spreadsheetbench-verified@1.0 -t 43213
0ad7209
433-47
uvx harbor run -d spreadsheetbench-verified@1.0 -t 433-47
0ad7209
43436
uvx harbor run -d spreadsheetbench-verified@1.0 -t 43436
0ad7209
43589
uvx harbor run -d spreadsheetbench-verified@1.0 -t 43589
0ad7209
43657
uvx harbor run -d spreadsheetbench-verified@1.0 -t 43657
0ad7209
438-18
uvx harbor run -d spreadsheetbench-verified@1.0 -t 438-18
0ad7209
440-24
uvx harbor run -d spreadsheetbench-verified@1.0 -t 440-24
0ad7209
44017
uvx harbor run -d spreadsheetbench-verified@1.0 -t 44017
0ad7209
44266
uvx harbor run -d spreadsheetbench-verified@1.0 -t 44266
0ad7209
44296
uvx harbor run -d spreadsheetbench-verified@1.0 -t 44296
0ad7209
44389
uvx harbor run -d spreadsheetbench-verified@1.0 -t 44389
0ad7209
44628
uvx harbor run -d spreadsheetbench-verified@1.0 -t 44628
0ad7209
448-11
uvx harbor run -d spreadsheetbench-verified@1.0 -t 448-11
0ad7209
44913
uvx harbor run -d spreadsheetbench-verified@1.0 -t 44913
0ad7209
45063
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45063
0ad7209
45300
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45300
0ad7209
45372
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45372
0ad7209
455-35
uvx harbor run -d spreadsheetbench-verified@1.0 -t 455-35
0ad7209
45635
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45635
0ad7209
45707
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45707
0ad7209
45738
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45738
0ad7209
45896
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45896
0ad7209
45937
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45937
0ad7209
45944
uvx harbor run -d spreadsheetbench-verified@1.0 -t 45944
0ad7209
46121
uvx harbor run -d spreadsheetbench-verified@1.0 -t 46121
0ad7209