Last active
February 17, 2026 02:28
-
-
Save shntnu/f339a1a1b611dbd852a2749aaaf59a3c to your computer and use it in GitHub Desktop.
CompoundValet CSV identifier gap analysis — run in https://shell.duckdb.org/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -- CompoundValet CSV: no structural identifiers → can't join to other databases | |
| -- Paste into https://shell.duckdb.org/ | |
| -- | |
| -- The Drug Repurposing Hub is a similar drug-target database but includes | |
| -- InChIKey, SMILES, and PubChem CID — making it instantly joinable. | |
| -- CompoundValet has only drug names. | |
| -- Load CompoundValet | |
| CREATE TABLE cv AS | |
| SELECT * FROM read_csv_auto( | |
| 'https://compoundvalet.ai/media/a8b962f8245436ef2226cff551216734354ee496d8e0a77036f6be37.csv' | |
| ); | |
| -- Load Drug Repurposing Hub (similar database, but with identifiers) | |
| CREATE TABLE rh AS | |
| SELECT DISTINCT ON (lower(trim(pert_iname))) | |
| pert_iname, smiles, InChIKey, pubchem_cid | |
| FROM read_csv( | |
| 'https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_samples_20200324.txt', | |
| delim = '\t', comment = '!', header = true | |
| ) | |
| WHERE InChIKey IS NOT NULL AND trim(InChIKey) != ''; | |
| -- 1. CompoundValet: only has drug names — no way to join programmatically | |
| SELECT column_name, column_type | |
| FROM (DESCRIBE cv); | |
| -- 2. Repurposing Hub: same kind of data, but with structural identifiers | |
| SELECT column_name, column_type | |
| FROM (DESCRIBE rh); | |
| -- 3. Name-matching through Repurposing Hub as a workaround: how many match? | |
| SELECT | |
| (SELECT count(*) FROM cv) AS compoundvalet_total, | |
| count(*) AS matched_by_name, | |
| round(100.0 * count(*) / (SELECT count(*) FROM cv), 1) AS pct | |
| FROM cv | |
| JOIN rh ON lower(trim(cv.Drug)) = lower(trim(rh.pert_iname)); | |
| -- 4. Sample matched rows — CV annotations enriched with structural identifiers | |
| SELECT cv.Drug, cv.Target, cv.Verdict, cv.Selectivity, | |
| rh.InChIKey, rh.pubchem_cid, left(rh.smiles, 50) AS smiles | |
| FROM cv | |
| JOIN rh ON lower(trim(cv.Drug)) = lower(trim(rh.pert_iname)) | |
| WHERE cv.Verdict = 'VERIFIED' AND cv.Selectivity = 'SELECTIVE' | |
| LIMIT 10; | |
| -- 5. If CompoundValet added InChIKey, every row would look like that | |
| -- — no third-party database needed as a bridge. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment