Athena Query Cookbook¶
Copy-paste SQL for common cross-source analyses. All queries run against the bioingest database.
Find all diseases associated with a protein¶
-- By gene name
SELECT mondo_id, mondo_name, disease_category, source, score
FROM bioingest.unified_protein_disease
WHERE gene_name = 'EGFR'
ORDER BY score DESC;
-- By UniProt ID
SELECT mondo_id, mondo_name, source, score
FROM bioingest.unified_protein_disease
WHERE uniprot_id = 'P00533'
ORDER BY score DESC;
-- Count sources per disease for a protein (multi-source evidence)
SELECT mondo_name, disease_category, COUNT(DISTINCT source) AS n_sources, MAX(score) AS max_score
FROM bioingest.unified_protein_disease
WHERE gene_name = 'BRCA1'
GROUP BY mondo_name, disease_category
ORDER BY n_sources DESC, max_score DESC;
Find all proteins in a pathway¶
-- All proteins in "Signaling by EGFR"
SELECT uniprot_id, gene_name, pathway_id, pathway_name
FROM bioingest.unified_protein_pathway
WHERE pathway_name LIKE '%Signaling by EGFR%';
-- All pathways for a protein
SELECT pathway_id, pathway_name
FROM bioingest.unified_protein_pathway
WHERE gene_name = 'TP53';
-- Pathways shared between two proteins
SELECT a.pathway_id, a.pathway_name
FROM bioingest.unified_protein_pathway a
JOIN bioingest.unified_protein_pathway b ON a.pathway_id = b.pathway_id
WHERE a.gene_name = 'EGFR' AND b.gene_name = 'ERBB2';
Cross-source evidence scoring¶
-- Proteins with strongest multi-source evidence for Alzheimer's
SELECT uniprot_id, gene_name,
COUNT(DISTINCT source) AS n_sources,
MAX(score) AS best_score,
ARRAY_AGG(DISTINCT source) AS sources
FROM bioingest.unified_protein_disease
WHERE mondo_name LIKE '%Alzheimer%'
GROUP BY uniprot_id, gene_name
HAVING COUNT(DISTINCT source) >= 2
ORDER BY n_sources DESC, best_score DESC
LIMIT 50;
-- Compare Open Targets vs DISEASES 2.0 for a gene
SELECT source, mondo_name, score
FROM bioingest.unified_protein_disease
WHERE gene_name = 'IL6'
ORDER BY mondo_name, source;
Tissue expression lookup¶
-- Top tissues for a protein
SELECT tissue, tpm
FROM bioingest.unified_expression
WHERE gene_name = 'INS'
ORDER BY tpm DESC;
-- Proteins highly expressed in a tissue
SELECT uniprot_id, gene_name, tpm
FROM bioingest.unified_expression
WHERE tissue = 'Liver'
ORDER BY tpm DESC
LIMIT 20;
-- Tissue-specific proteins (expressed in one tissue, low elsewhere)
SELECT gene_name, tissue, tpm
FROM bioingest.unified_expression e1
WHERE tpm > 100
AND NOT EXISTS (
SELECT 1 FROM bioingest.unified_expression e2
WHERE e2.gene_name = e1.gene_name AND e2.tissue != e1.tissue AND e2.tpm > 10
)
ORDER BY tpm DESC;
Drug-target queries¶
-- All drugs targeting EGFR
SELECT chembl_id, target_type, source
FROM bioingest.unified_drug_target
WHERE gene_name = 'EGFR';
-- Drugs with known targets in a pathway
SELECT dt.gene_name, dt.chembl_id, dt.source, pp.pathway_name
FROM bioingest.unified_drug_target dt
JOIN bioingest.unified_protein_pathway pp ON dt.uniprot_id = pp.uniprot_id
WHERE pp.pathway_name LIKE '%apoptosis%';
-- TTD drug indications via drug_map
SELECT drug_name, indications, targets
FROM bioingest.drug_map
WHERE drug_name LIKE '%imatinib%';
Full protein annotation card¶
-- Everything known about a protein in one query
SELECT *
FROM bioingest.protein_annotations
WHERE gene_name = 'EGFR';
-- Build a manual annotation card with details
WITH target AS (SELECT * FROM bioingest.protein_map WHERE gene_name = 'EGFR')
SELECT 'IDs' AS section, uniprot_id || ' | ' || ensembl_gene_id || ' | ' || chembl_target_id AS detail FROM target
UNION ALL
SELECT 'Diseases', CAST(COUNT(*) AS VARCHAR) || ' associations' FROM bioingest.unified_protein_disease WHERE gene_name = 'EGFR'
UNION ALL
SELECT 'Interactions', CAST(COUNT(*) AS VARCHAR) || ' PPIs (STRINGā„700)' FROM bioingest.unified_ppi WHERE protein1_name = 'EGFR'
UNION ALL
SELECT 'Pathways', CAST(COUNT(*) AS VARCHAR) || ' pathways' FROM bioingest.unified_protein_pathway WHERE gene_name = 'EGFR'
UNION ALL
SELECT 'Drugs', CAST(COUNT(*) AS VARCHAR) || ' drug-target links' FROM bioingest.unified_drug_target WHERE gene_name = 'EGFR'
UNION ALL
SELECT 'Expression', CAST(COUNT(*) AS VARCHAR) || ' tissues (TPM>1)' FROM bioingest.unified_expression WHERE gene_name = 'EGFR';
Interaction partners¶
-- Direct interaction partners of EGFR with high confidence
SELECT protein2_uniprot, protein2_name, score
FROM bioingest.unified_ppi
WHERE protein1_name = 'EGFR'
ORDER BY score DESC
LIMIT 20;
-- Shared interaction partners between two proteins
SELECT a.protein2_name, a.score AS score_with_A, b.score AS score_with_B
FROM bioingest.unified_ppi a
JOIN bioingest.unified_ppi b ON a.protein2_uniprot = b.protein2_uniprot
WHERE a.protein1_name = 'EGFR' AND b.protein1_name = 'ERBB2'
ORDER BY a.score + b.score DESC;
Cross-source record counts¶
SELECT 'UniProt' AS source, COUNT(*) AS records FROM bioingest.uniprot__swissprot_tsv
UNION ALL SELECT 'Open Targets targets', COUNT(*) FROM bioingest.opentargets__targets
UNION ALL SELECT 'STRING links', COUNT(*) FROM bioingest.string__protein_links
UNION ALL SELECT 'Reactome pathways', COUNT(*) FROM bioingest.reactome__uniprot2reactome_all_levels
UNION ALL SELECT 'SomaScan 11K', COUNT(*) FROM bioingest.somalogic__somascan_11k_menu
UNION ALL SELECT 'protein_map', COUNT(*) FROM bioingest.protein_map
UNION ALL SELECT 'disease_map', COUNT(*) FROM bioingest.disease_map;