Skip to content

Instantly share code, notes, and snippets.

@shntnu
Last active February 17, 2026 02:28
Show Gist options
  • Select an option

  • Save shntnu/f339a1a1b611dbd852a2749aaaf59a3c to your computer and use it in GitHub Desktop.

Select an option

Save shntnu/f339a1a1b611dbd852a2749aaaf59a3c to your computer and use it in GitHub Desktop.
CompoundValet CSV identifier gap analysis — run in https://shell.duckdb.org/
-- CompoundValet CSV: no structural identifiers → can't join to other databases
-- Paste into https://shell.duckdb.org/
--
-- The Drug Repurposing Hub is a similar drug-target database but includes
-- InChIKey, SMILES, and PubChem CID — making it instantly joinable.
-- CompoundValet has only drug names.
-- Load CompoundValet
CREATE TABLE cv AS
SELECT * FROM read_csv_auto(
'https://compoundvalet.ai/media/a8b962f8245436ef2226cff551216734354ee496d8e0a77036f6be37.csv'
);
-- Load Drug Repurposing Hub (similar database, but with identifiers)
CREATE TABLE rh AS
SELECT DISTINCT ON (lower(trim(pert_iname)))
pert_iname, smiles, InChIKey, pubchem_cid
FROM read_csv(
'https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_samples_20200324.txt',
delim = '\t', comment = '!', header = true
)
WHERE InChIKey IS NOT NULL AND trim(InChIKey) != '';
-- 1. CompoundValet: only has drug names — no way to join programmatically
SELECT column_name, column_type
FROM (DESCRIBE cv);
-- 2. Repurposing Hub: same kind of data, but with structural identifiers
SELECT column_name, column_type
FROM (DESCRIBE rh);
-- 3. Name-matching through Repurposing Hub as a workaround: how many match?
SELECT
(SELECT count(*) FROM cv) AS compoundvalet_total,
count(*) AS matched_by_name,
round(100.0 * count(*) / (SELECT count(*) FROM cv), 1) AS pct
FROM cv
JOIN rh ON lower(trim(cv.Drug)) = lower(trim(rh.pert_iname));
-- 4. Sample matched rows — CV annotations enriched with structural identifiers
SELECT cv.Drug, cv.Target, cv.Verdict, cv.Selectivity,
rh.InChIKey, rh.pubchem_cid, left(rh.smiles, 50) AS smiles
FROM cv
JOIN rh ON lower(trim(cv.Drug)) = lower(trim(rh.pert_iname))
WHERE cv.Verdict = 'VERIFIED' AND cv.Selectivity = 'SELECTIVE'
LIMIT 10;
-- 5. If CompoundValet added InChIKey, every row would look like that
-- — no third-party database needed as a bridge.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment