codesearch/sql_solutions_mega.sql at main · serenedb/codesearch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
\timing on
SET http_retries = 12;
SET http_retry_wait_ms = 5000;
DROP INDEX IF EXISTS solutions_idx;
DROP VIEW IF EXISTS solutions_v;
DROP VIEW IF EXISTS sols_cf_v;
DROP VIEW IF EXISTS sols_mh_v;
DROP VIEW IF EXISTS sols_rc_v;

-- Codeforces accepted submissions: the full `default` split (~11.4M rows),
-- read straight from Hugging Face. submission_id is a globally unique natural
-- id, so the view has no window function and the index sink builds in parallel.
CREATE VIEW sols_cf_v AS
SELECT submission_id::BIGINT AS id,
       problem_id            AS task_id,
       source                AS code,
       length(source)        AS code_len,
       programmingLanguage   AS lang,
       timeconsumedmillis::INTEGER           AS time_ms,
       (memoryconsumedbytes / 1024)::INTEGER AS memory_kb
FROM read_parquet('hf://datasets/open-r1/codeforces-submissions@~parquet/default/*/*.parquet')
WHERE verdict = 'OK' AND length(source) BETWEEN 20 AND 10000;

-- MBPP + HumanEval reference solutions, reshaped from the Hugging Face parquet
-- (same derivation as the task views). row_number() ranks only these ~1.1k
-- rows, so it doesn't serialize the big CF branch.
CREATE VIEW sols_mh_v AS
SELECT 900000000 + row_number() OVER () AS id,
       task_id, code,
       length(code)          AS code_len,
       'Python 3'            AS lang,
       NULL::INTEGER         AS time_ms,
       NULL::INTEGER         AS memory_kb
FROM (
  SELECT 'mbpp/' || task_id::VARCHAR AS task_id, trim(code) AS code
  FROM read_parquet('hf://datasets/google-research-datasets/mbpp@~parquet/full/*/*.parquet')
  WHERE code IS NOT NULL
  UNION ALL
  SELECT 'humaneval/' || split_part(task_id, '/', 2) AS task_id,
         trim(prompt || canonical_solution) AS code
  FROM read_parquet('hf://datasets/openai/openai_humaneval@~parquet/openai_humaneval/test/*.parquet')
  WHERE prompt IS NOT NULL
) mh
WHERE length(code) BETWEEN 20 AND 10000;

-- Rosetta Code: per-language source, keyed back to the hash-derived task id.
CREATE VIEW sols_rc_v AS
SELECT 950000000 + row_number() OVER () AS id,
       'rc/' || abs(hash(task_name)) AS task_id,
       code,
       length(code)          AS code_len,
       language_name         AS lang,
       NULL::INTEGER         AS time_ms,
       NULL::INTEGER         AS memory_kb
FROM read_parquet('hf://datasets/christopher/rosetta-code@~parquet/default/train/*.parquet')
WHERE code IS NOT NULL AND length(code) BETWEEN 20 AND 10000;

CREATE VIEW solutions_v AS
SELECT * FROM sols_cf_v
UNION ALL SELECT * FROM sols_mh_v
UNION ALL SELECT * FROM sols_rc_v;

CREATE INDEX solutions_idx ON solutions_v
USING inverted(id, task_id, lang, code code_grams)
INCLUDE (id, task_id, code, code_len, lang, time_ms, memory_kb);

SELECT count(*) AS solutions_done FROM solutions_idx;