Skip to content

Athena Query Cookbook

Copy-paste SQL for common cross-source analyses. All queries run against the bioingest database.

Find all diseases associated with a protein

-- By gene name
SELECT mondo_id, mondo_name, disease_category, source, score
FROM bioingest.unified_protein_disease
WHERE gene_name = 'EGFR'
ORDER BY score DESC;

-- By UniProt ID
SELECT mondo_id, mondo_name, source, score
FROM bioingest.unified_protein_disease
WHERE uniprot_id = 'P00533'
ORDER BY score DESC;

-- Count sources per disease for a protein (multi-source evidence)
SELECT mondo_name, disease_category, COUNT(DISTINCT source) AS n_sources, MAX(score) AS max_score
FROM bioingest.unified_protein_disease
WHERE gene_name = 'BRCA1'
GROUP BY mondo_name, disease_category
ORDER BY n_sources DESC, max_score DESC;

Find all proteins in a pathway

-- All proteins in "Signaling by EGFR"
SELECT uniprot_id, gene_name, pathway_id, pathway_name
FROM bioingest.unified_protein_pathway
WHERE pathway_name LIKE '%Signaling by EGFR%';

-- All pathways for a protein
SELECT pathway_id, pathway_name
FROM bioingest.unified_protein_pathway
WHERE gene_name = 'TP53';

-- Pathways shared between two proteins
SELECT a.pathway_id, a.pathway_name
FROM bioingest.unified_protein_pathway a
JOIN bioingest.unified_protein_pathway b ON a.pathway_id = b.pathway_id
WHERE a.gene_name = 'EGFR' AND b.gene_name = 'ERBB2';

Cross-source evidence scoring

-- Proteins with strongest multi-source evidence for Alzheimer's
SELECT uniprot_id, gene_name,
       COUNT(DISTINCT source) AS n_sources,
       MAX(score) AS best_score,
       ARRAY_AGG(DISTINCT source) AS sources
FROM bioingest.unified_protein_disease
WHERE mondo_name LIKE '%Alzheimer%'
GROUP BY uniprot_id, gene_name
HAVING COUNT(DISTINCT source) >= 2
ORDER BY n_sources DESC, best_score DESC
LIMIT 50;

-- Compare Open Targets vs DISEASES 2.0 for a gene
SELECT source, mondo_name, score
FROM bioingest.unified_protein_disease
WHERE gene_name = 'IL6'
ORDER BY mondo_name, source;

Tissue expression lookup

-- Top tissues for a protein
SELECT tissue, tpm
FROM bioingest.unified_expression
WHERE gene_name = 'INS'
ORDER BY tpm DESC;

-- Proteins highly expressed in a tissue
SELECT uniprot_id, gene_name, tpm
FROM bioingest.unified_expression
WHERE tissue = 'Liver'
ORDER BY tpm DESC
LIMIT 20;

-- Tissue-specific proteins (expressed in one tissue, low elsewhere)
SELECT gene_name, tissue, tpm
FROM bioingest.unified_expression e1
WHERE tpm > 100
  AND NOT EXISTS (
    SELECT 1 FROM bioingest.unified_expression e2
    WHERE e2.gene_name = e1.gene_name AND e2.tissue != e1.tissue AND e2.tpm > 10
  )
ORDER BY tpm DESC;

Drug-target queries

-- All drugs targeting EGFR
SELECT chembl_id, target_type, source
FROM bioingest.unified_drug_target
WHERE gene_name = 'EGFR';

-- Drugs with known targets in a pathway
SELECT dt.gene_name, dt.chembl_id, dt.source, pp.pathway_name
FROM bioingest.unified_drug_target dt
JOIN bioingest.unified_protein_pathway pp ON dt.uniprot_id = pp.uniprot_id
WHERE pp.pathway_name LIKE '%apoptosis%';

-- TTD drug indications via drug_map
SELECT drug_name, indications, targets
FROM bioingest.drug_map
WHERE drug_name LIKE '%imatinib%';

Full protein annotation card

-- Everything known about a protein in one query
SELECT *
FROM bioingest.protein_annotations
WHERE gene_name = 'EGFR';

-- Build a manual annotation card with details
WITH target AS (SELECT * FROM bioingest.protein_map WHERE gene_name = 'EGFR')
SELECT 'IDs' AS section, uniprot_id || ' | ' || ensembl_gene_id || ' | ' || chembl_target_id AS detail FROM target
UNION ALL
SELECT 'Diseases', CAST(COUNT(*) AS VARCHAR) || ' associations' FROM bioingest.unified_protein_disease WHERE gene_name = 'EGFR'
UNION ALL
SELECT 'Interactions', CAST(COUNT(*) AS VARCHAR) || ' PPIs (STRING≄700)' FROM bioingest.unified_ppi WHERE protein1_name = 'EGFR'
UNION ALL
SELECT 'Pathways', CAST(COUNT(*) AS VARCHAR) || ' pathways' FROM bioingest.unified_protein_pathway WHERE gene_name = 'EGFR'
UNION ALL
SELECT 'Drugs', CAST(COUNT(*) AS VARCHAR) || ' drug-target links' FROM bioingest.unified_drug_target WHERE gene_name = 'EGFR'
UNION ALL
SELECT 'Expression', CAST(COUNT(*) AS VARCHAR) || ' tissues (TPM>1)' FROM bioingest.unified_expression WHERE gene_name = 'EGFR';

Interaction partners

-- Direct interaction partners of EGFR with high confidence
SELECT protein2_uniprot, protein2_name, score
FROM bioingest.unified_ppi
WHERE protein1_name = 'EGFR'
ORDER BY score DESC
LIMIT 20;

-- Shared interaction partners between two proteins
SELECT a.protein2_name, a.score AS score_with_A, b.score AS score_with_B
FROM bioingest.unified_ppi a
JOIN bioingest.unified_ppi b ON a.protein2_uniprot = b.protein2_uniprot
WHERE a.protein1_name = 'EGFR' AND b.protein1_name = 'ERBB2'
ORDER BY a.score + b.score DESC;

Cross-source record counts

SELECT 'UniProt' AS source, COUNT(*) AS records FROM bioingest.uniprot__swissprot_tsv
UNION ALL SELECT 'Open Targets targets', COUNT(*) FROM bioingest.opentargets__targets
UNION ALL SELECT 'STRING links', COUNT(*) FROM bioingest.string__protein_links
UNION ALL SELECT 'Reactome pathways', COUNT(*) FROM bioingest.reactome__uniprot2reactome_all_levels
UNION ALL SELECT 'SomaScan 11K', COUNT(*) FROM bioingest.somalogic__somascan_11k_menu
UNION ALL SELECT 'protein_map', COUNT(*) FROM bioingest.protein_map
UNION ALL SELECT 'disease_map', COUNT(*) FROM bioingest.disease_map;