-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsql_solutions_mega.sql
More file actions
67 lines (61 loc) · 2.72 KB
/
Copy pathsql_solutions_mega.sql
File metadata and controls
67 lines (61 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
\timing on
SET http_retries = 12;
SET http_retry_wait_ms = 5000;
DROP INDEX IF EXISTS solutions_idx;
DROP VIEW IF EXISTS solutions_v;
DROP VIEW IF EXISTS sols_cf_v;
DROP VIEW IF EXISTS sols_mh_v;
DROP VIEW IF EXISTS sols_rc_v;
-- Codeforces accepted submissions: the full `default` split (~11.4M rows),
-- read straight from Hugging Face. submission_id is a globally unique natural
-- id, so the view has no window function and the index sink builds in parallel.
CREATE VIEW sols_cf_v AS
SELECT submission_id::BIGINT AS id,
problem_id AS task_id,
source AS code,
length(source) AS code_len,
programmingLanguage AS lang,
timeconsumedmillis::INTEGER AS time_ms,
(memoryconsumedbytes / 1024)::INTEGER AS memory_kb
FROM read_parquet('hf://datasets/open-r1/codeforces-submissions@~parquet/default/*/*.parquet')
WHERE verdict = 'OK' AND length(source) BETWEEN 20 AND 10000;
-- MBPP + HumanEval reference solutions, reshaped from the Hugging Face parquet
-- (same derivation as the task views). row_number() ranks only these ~1.1k
-- rows, so it doesn't serialize the big CF branch.
CREATE VIEW sols_mh_v AS
SELECT 900000000 + row_number() OVER () AS id,
task_id, code,
length(code) AS code_len,
'Python 3' AS lang,
NULL::INTEGER AS time_ms,
NULL::INTEGER AS memory_kb
FROM (
SELECT 'mbpp/' || task_id::VARCHAR AS task_id, trim(code) AS code
FROM read_parquet('hf://datasets/google-research-datasets/mbpp@~parquet/full/*/*.parquet')
WHERE code IS NOT NULL
UNION ALL
SELECT 'humaneval/' || split_part(task_id, '/', 2) AS task_id,
trim(prompt || canonical_solution) AS code
FROM read_parquet('hf://datasets/openai/openai_humaneval@~parquet/openai_humaneval/test/*.parquet')
WHERE prompt IS NOT NULL
) mh
WHERE length(code) BETWEEN 20 AND 10000;
-- Rosetta Code: per-language source, keyed back to the hash-derived task id.
CREATE VIEW sols_rc_v AS
SELECT 950000000 + row_number() OVER () AS id,
'rc/' || abs(hash(task_name)) AS task_id,
code,
length(code) AS code_len,
language_name AS lang,
NULL::INTEGER AS time_ms,
NULL::INTEGER AS memory_kb
FROM read_parquet('hf://datasets/christopher/rosetta-code@~parquet/default/train/*.parquet')
WHERE code IS NOT NULL AND length(code) BETWEEN 20 AND 10000;
CREATE VIEW solutions_v AS
SELECT * FROM sols_cf_v
UNION ALL SELECT * FROM sols_mh_v
UNION ALL SELECT * FROM sols_rc_v;
CREATE INDEX solutions_idx ON solutions_v
USING inverted(id, task_id, lang, code code_grams)
INCLUDE (id, task_id, code, code_len, lang, time_ms, memory_kb);
SELECT count(*) AS solutions_done FROM solutions_idx;