% 141 -- Wang et al. 2021, Cancer Res: smoking history and tumor mutation burden in advanced NSCLC.
@article{141,
  title = {Association between Smoking History and Tumor Mutation Burden in Advanced Non-Small Cell Lung Cancer},
  journal = {Cancer Res},
  volume = {81},
  year = {2021},
  month = may,
  day = {1},
  pages = {2566--2573},
  abstract = {

Lung carcinogenesis is a complex and stepwise process involving accumulation of genetic mutations in signaling and oncogenic pathways via interactions with environmental factors and host susceptibility. Tobacco exposure is the leading cause of lung cancer, but its relationship to clinically relevant mutations and the composite tumor mutation burden (TMB) has not been fully elucidated. In this study, we investigated the dose-response relationship in a retrospective observational study of 931 patients treated for advanced-stage non-small cell lung cancer (NSCLC) between April 2013 and February 2020 at the Dana Farber Cancer Institute and Brigham and Women{\textquoteright}s Hospital. Doubling smoking pack-years was associated with increased KRAS G12C and less frequent EGFR del19 and EGFR L858R mutations, whereas doubling smoking-free months was associated with more frequent EGFR L858R. In advanced lung adenocarcinoma, doubling smoking pack-years was associated with an increase in TMB, whereas doubling smoking-free months was associated with a decrease in TMB, after controlling for age, gender, and stage. There is a significant dose-response association of smoking history with genetic alterations in cancer-related pathways and TMB in advanced lung adenocarcinoma. SIGNIFICANCE: This study clarifies the relationship between smoking history and clinically relevant mutations in non-small cell lung cancer, revealing the potential of smoking history as a surrogate for tumor mutation burden.

},
  issn = {1538-7445},
  doi = {10.1158/0008-5472.CAN-20-3991},
  author = {Wang, Xinan and Ricciuti, Biagio and Nguyen, Tom and Li, Xihao and Rabin, Michael S and Awad, Mark M and Lin, Xihong and Johnson, Bruce E and Christiani, David C}
}

% 138 -- Caggiano et al. 2021, Nat Commun: CelFiE cell-type decomposition of cell-free DNA.
@article{138,
  title = {Comprehensive cell type decomposition of circulating cell-free {DNA} with {CelFiE}},
  journal = {Nat Commun},
  volume = {12},
  year = {2021},
  month = may,
  day = {11},
  pages = {2717},
  abstract = {

Circulating cell-free DNA (cfDNA) in the bloodstream originates from dying cells and is a promising noninvasive biomarker for cell death. Here, we propose an algorithm, CelFiE, to accurately estimate the relative abundances of cell types and tissues contributing to cfDNA from epigenetic cfDNA sequencing. In contrast to previous work, CelFiE accommodates low coverage data, does not require CpG site curation, and estimates contributions from multiple unknown cell types that are not available in external reference data. In simulations, CelFiE accurately estimates known and unknown cell type proportions from low coverage and noisy cfDNA mixtures, including from cell types composing less than 1\% of the total mixture. When used in two clinically-relevant situations, CelFiE correctly estimates a large placenta component in pregnant women, and an elevated skeletal muscle component in amyotrophic lateral sclerosis (ALS) patients, consistent with the occurrence of muscle wasting typical in these patients. Together, these results show how CelFiE could be a useful tool for biomarker discovery and monitoring the progression of degenerative disease.

},
  keywords = {Adult, Algorithms, Amyotrophic Lateral Sclerosis, B-Lymphocytes, Biomarkers, Case-Control Studies, Cell-Free Nucleic Acids, DNA Methylation, Epigenesis, Genetic, Female, Humans, Macrophages, Male, Monocytes, Muscle, Skeletal, Neutrophils, Organ Specificity, Pregnancy, Pregnancy Trimesters, T-Lymphocytes},
  issn = {2041-1723},
  doi = {10.1038/s41467-021-22901-x},
  author = {Caggiano, Christa and Celona, Barbara and Garton, Fleur and Mefford, Joel and Black, Brian L and Henderson, Robert and Lomen-Hoerth, Catherine and Dahl, Andrew and Zaitlen, Noah}
}

% 117 -- Herrera-Luis et al. 2021, J Allergy Clin Immunol: DNASE1L3 variant and asthma exacerbations.
@article{117,
  title = {A deoxyribonuclease 1-like 3 genetic variant associates with asthma exacerbations},
  journal = {J Allergy Clin Immunol},
  volume = {147},
  year = {2021},
  month = mar,
  pages = {1095--1097.e10},
  issn = {1097-6825},
  doi = {10.1016/j.jaci.2020.09.027},
  author = {Herrera-Luis, Esther and Lorenzo-Diaz, Fabian and Samedy-Bates, Lesly-Anne and Eng, Celeste and Villar, Jes{\'u}s and Rodriguez-Santana, Jose R and Burchard, Esteban G and Pino-Yanes, Maria}
}

% 130 -- Nayar et al. 2021, Biol Psychiatry: ASD polygenic burden and the broad autism phenotype.
@article{130,
  title = {Elevated Polygenic Burden for Autism Spectrum Disorder Is Associated With the Broad Autism Phenotype in Mothers of Individuals With Autism Spectrum Disorder},
  journal = {Biol Psychiatry},
  volume = {89},
  year = {2021},
  month = mar,
  day = {1},
  pages = {476--485},
  abstract = {

BACKGROUND: Autism spectrum disorder (ASD) is a multifactorial neurodevelopmental disorder that encompasses a complex and heterogeneous set of traits. Subclinical traits that mirror the core features of ASD, referred to as the broad autism phenotype (BAP), have been documented repeatedly in unaffected relatives and are believed to reflect underlying genetic liability to ASD. The BAP may help inform the etiology of ASD by allowing the stratification of families into more phenotypically and etiologically homogeneous subgroups. This study explores polygenic scores related to the BAP.

METHODS: Phenotypic and genotypic information were obtained from 2614 trios from the Simons Simplex Collection. Polygenic scores of ASD (ASD-PGSs) were generated across the sample to determine the shared genetic overlap between the BAP and ASD. Maternal and paternal ASD-PGSs were explored in relation to BAP traits and their child{\textquoteright}s ASD symptomatology.

RESULTS: Maternal pragmatic language was related to child{\textquoteright}s social communicative atypicalities. In fathers, rigid personality was related to increased repetitive behaviors in children. Maternal (but not paternal) ASD-PGSs were related to the pragmatic language and rigid BAP domains.

CONCLUSIONS: Associations emerged between parent and child phenotypes, with more associations emerging in mothers than in fathers. ASD-PGS associations emerged with BAP in mothers only, highlighting the potential for a female protective factor, and implicating the polygenic etiology of ASD-related phenotypes in the BAP.

},
  keywords = {Autism Spectrum Disorder, Autistic Disorder, Child, Fathers, Female, Humans, Male, Mothers, Phenotype},
  issn = {1873-2402},
  doi = {10.1016/j.biopsych.2020.08.029},
  author = {Nayar, Kritika and Sealock, Julia M and Maltman, Nell and Bush, Lauren and Cook, Edwin H and Davis, Lea K and Losh, Molly}
}

% 98 -- Herrera-Luis et al. 2021, Pediatr Allergy Immunol: GWAS of asthma with severe exacerbations.
@article{98,
  title = {Genome-wide association study reveals a novel locus for asthma with severe exacerbations in diverse populations},
  journal = {Pediatr Allergy Immunol},
  volume = {32},
  year = {2021},
  month = jan,
  pages = {106--115},
  abstract = {

BACKGROUND: Severe asthma exacerbations are a major cause of asthma morbidity and increased healthcare costs. Several studies have shown racial and ethnic differences in asthma exacerbation rates. We aimed to identify genetic variants associated with severe exacerbations in two high-risk populations for asthma.

METHODS: A genome-wide association study of asthma in children and youth with severe exacerbations was performed in 1283 exacerbators and 2027 controls without asthma of Latino ancestry. Independent suggestive variants (P $\leq$ 5 {\texttimes} 10$^{-6}$) were selected for replication in 448 African Americans exacerbators and 595 controls. Case-only analyses were performed comparing the exacerbators with additional 898 Latinos and 524 African Americans asthma patients without exacerbations, while adjusting by treatment category as a proxy of asthma severity. We analyzed the functionality of associated variants with in silico methods and by correlating genotypes with methylation levels in whole blood in a subset of 473 Latinos.

RESULTS: We identified two genome-wide significant associations for susceptibility to asthma with severe exacerbations, including a novel locus located at chromosome 2p21 (rs4952375, odds ratio = 1.39, P = 3.8 {\texttimes} 10$^{-8}$), which was also associated with asthma exacerbations in a case-only analysis (odds ratio = 1.25, P = 1.95 {\texttimes} 10$^{-3}$). This polymorphism is an expression quantitative trait locus of the long intergenic non-protein coding RNA 1913 (LINC01913) in lung tissues (P = 1.3 {\texttimes} 10$^{-7}$) and influences methylation levels of the protein kinase domain-containing cytoplasmic (PKDCC) gene in whole-blood cells (P = 9.8 {\texttimes} 10$^{-5}$).

CONCLUSION: We identified a novel susceptibility locus for severe asthma exacerbations in Hispanic/Latino and African American youths with functional effects in gene expression and methylation status of neighboring genes.

},
  issn = {1399-3038},
  doi = {10.1111/pai.13337},
  author = {Herrera-Luis, Esther and Espuela-Ortiz, Antonio and Lorenzo-Diaz, Fabian and Keys, Kevin L and Mak, Angel C Y and Eng, Celeste and Huntsman, Scott and Villar, Jes{\'u}s and Rodriguez-Santana, Jose R and Burchard, Esteban G and Pino-Yanes, Maria}
}

% 142 -- Bonder et al. 2021, Nat Genet: regulatory variants in pluripotent cells.
@article{142,
  title = {Identification of rare and common regulatory variants in pluripotent cells using population-scale transcriptomics},
  journal = {Nat Genet},
  volume = {53},
  year = {2021},
  month = mar,
  pages = {313--321},
  abstract = {

Induced pluripotent stem cells (iPSCs) are an established cellular system to study the impact of genetic variants in derived cell types and developmental contexts. However, in their pluripotent state, the disease impact of genetic variants is less well known. Here, we integrate data from 1,367 human iPSC lines to comprehensively map common and rare regulatory variants in human pluripotent cells. Using this population-scale resource, we report hundreds of new colocalization events for human traits specific to iPSCs, and find increased power to identify rare regulatory variants compared with somatic tissues. Finally, we demonstrate how iPSCs enable the identification of causal genes for rare diseases.

},
  keywords = {Bardet-Biedl Syndrome, Calcium Channels, Cell Line, Cerebellar Ataxia, DNA Methylation, Gene Expression, Genetic Variation, Humans, Induced Pluripotent Stem Cells, Polymorphism, Single Nucleotide, Proteins, Quantitative Trait Loci, Rare Diseases, Regulatory Sequences, Nucleic Acid, Sequence Analysis, RNA, Whole Genome Sequencing},
  issn = {1546-1718},
  doi = {10.1038/s41588-021-00800-7},
  author = {Bonder, Marc Jan and Smail, Craig and Gloudemans, Michael J and Fr{\'e}sard, Laure and Jakubosky, David and D'Antonio, Matteo and Li, Xin and Ferraro, Nicole M and Carcamo-Orive, Ivan and Mirauta, Bogdan and Seaton, Daniel D and Cai, Na and Vakili, Dara and Horta, Danilo and Zhao, Chunli and Zastrow, Diane B and Bonner, Devon E and Wheeler, Matthew T and Kilpinen, Helena and Knowles, Joshua W and Smith, Erin N and Frazer, Kelly A and Montgomery, Stephen B and Stegle, Oliver}
}

% 136 -- Ji et al. 2021, Genet Epidemiol: rescaled meta-analysis for breast cancer PRS in East Asians.
% No volume or pages are recorded for this entry.
@article{136,
  title = {Incorporating {European} {GWAS} findings improve polygenic risk prediction accuracy of breast cancer among {East Asians}},
  journal = {Genet Epidemiol},
  year = {2021},
  month = mar,
  day = {19},
  abstract = {

Previous genome-wide association studies (GWASs) have been largely focused on European (EUR) populations. However, polygenic risk scores (PRSs) derived from EUR have been shown to perform worse in non-EURs compared with EURs. In this study, we aim to improve PRS prediction in East Asians (EASs). We introduce a rescaled meta-analysis framework to combine both EUR (N = 122,175) and EAS (N = 30,801) GWAS summary statistics. To improve PRS prediction in EASs, we use a scaling factor to up-weight the EAS data, such that the resulting effect size estimates are more relevant to EASs. We then derive PRSs for EAS from the rescaled meta-analysis results of EAS and EUR data. Evaluated in an independent EAS validation data set, this approach increases the prediction liability-adjusted Nagelkerke{\textquoteright}s pseudo $R^2$ by 40\%, 41\%, and 5\%, respectively, compared with PRSs derived from an EAS GWAS only, EUR GWAS only, and conventional fixed-effects meta-analysis of EAS and EUR data. The PRS derived from the rescaled meta-analysis approach achieved an area under the receiver operating characteristic curve (AUC)~of 0.6059, higher than AUC = 0.5782, 0.5809, 0.6008 for EAS, EUR, and conventional meta-analysis of EAS and EUR. We further compare PRSs constructed by single-nucleotide polymorphisms that have different linkage disequilibrium (LD) scores and minor allele frequencies (MAFs) between EUR and EAS, and observe that lower LD scores or MAF in EAS correspond to poorer PRS performance (AUC = 0.5677,~0.5530, respectively) than higher LD scores or MAF (AUC = 0.589, 0.5993, respectively). We finally build a PRS stratified by LD score differences in EUR and EAS using rescaled meta-analysis, and obtain an AUC of 0.6096, with improvement over other strategies investigated.

},
  issn = {1098-2272},
  doi = {10.1002/gepi.22382},
  author = {Ji, Ying and Long, Jirong and Kweon, Sun-Seog and Kang, Daehee and Kubo, Michiaki and Park, Boyoung and Shu, Xiao-Ou and Zheng, Wei and Tao, Ran and Li, Bingshan}
}

% 129 -- Sun et al. 2021, Genet Epidemiol: multiomic annotation of squamous cell lung cancer risk variants.
@article{129,
  title = {Integration of multiomic annotation data to prioritize and characterize inflammation and immune-related risk variants in squamous cell lung cancer},
  journal = {Genet Epidemiol},
  volume = {45},
  year = {2021},
  month = feb,
  pages = {99--114},
  abstract = {

Clinical trial results have recently demonstrated that inhibiting inflammation by targeting the interleukin-1$\beta$ pathway can offer a significant reduction in lung cancer incidence and mortality, highlighting a pressing and unmet need to understand the benefits of inflammation-focused lung cancer therapies at the genetic level. While numerous genome-wide association studies (GWAS) have explored the genetic etiology of lung cancer, there remains a large gap between the type of information that may be gleaned from an association study and the depth of understanding necessary to explain and drive translational findings. Thus, in this study we jointly model and integrate extensive multiomics data sources, utilizing a total of 40 genome-wide functional annotations that augment previously published results from the International Lung Cancer Consortium (ILCCO) GWAS, to prioritize and characterize single nucleotide polymorphisms (SNPs) that increase risk of squamous cell lung cancer through the inflammatory and immune responses. Our work bridges the gap between correlative analysis and translational follow-up research, refining GWAS association measures in an interpretable and systematic manner. In particular, reanalysis of the ILCCO data highlights the impact of highly associated SNPs from nuclear factor-$\kappa$B signaling pathway genes as well as major histocompatibility complex mediated variation in immune responses. One consequence of prioritizing likely functional SNPs is the pruning of variants that might be selected for follow-up work by over an order of magnitude, from potentially tens of thousands to hundreds. The strategies we introduce provide informative and interpretable approaches for incorporating extensive genome-wide annotation data in analysis of genetic association studies.

},
  issn = {1098-2272},
  doi = {10.1002/gepi.22358},
  author = {Sun, Ryan and Xu, Miao and Li, Xihao and Gaynor, Sheila and Zhou, Hufeng and Li, Zilin and Boss{\'e}, Yohan and Lam, Stephen and Tsao, Ming-Sound and Tardon, Adonina and Chen, Chu and Doherty, Jennifer and Goodman, Gary and Bojesen, Stig E and Landi, Maria T and Johansson, Mattias and Field, John K and Bickeb{\"o}ller, Heike and Wichmann, H-Erich and Risch, Angela and Rennert, Gadi and Arnold, Suzanne and Wu, Xifeng and Melander, Olle and Brunnstr{\"o}m, Hans and Le Marchand, Loic and Liu, Geoffrey and Andrew, Angeline and Duell, Eric and Kiemeney, Lambertus A and Shen, Hongbing and Haugen, Aage and Johansson, Mikael and Grankvist, Kjell and Caporaso, Neil and Woll, Penella and Teare, M Dawn and Scelo, Ghislaine and Hong, Yun-Chul and Yuan, Jian-Min and Lazarus, Philip and Schabath, Matthew B and Aldrich, Melinda C and Albanes, Demetrios and Mak, Raymond and Barbie, David and Brennan, Paul and Hung, Rayjean J and Amos, Christopher I and Christiani, David C and Lin, Xihong}
}

% 92 -- Goddard et al. 2021, Genet Epidemiol: lung function loci in African American children with asthma.
@article{92,
  title = {Integrative genomic analysis in {African American} children with asthma finds three novel loci associated with lung function},
  journal = {Genet Epidemiol},
  volume = {45},
  year = {2021},
  month = mar,
  pages = {190--208},
  abstract = {

Bronchodilator (BD) drugs are commonly prescribed for treatment and management of obstructive lung function present with diseases such as asthma. Administration of BD medication can partially or fully restore lung function as measured by pulmonary function tests. The genetics of baseline lung function measures taken before BD medication have been extensively studied, and the genetics of the BD response itself have received some attention. However, few studies have focused on the genetics of post-BD lung function. To address this gap, we analyzed lung function phenotypes in 1103 subjects from the Study of African Americans, Asthma, Genes, and Environment, a pediatric asthma case-control cohort, using an integrative genomic analysis approach that combined genotype, locus-specific genetic ancestry, and functional annotation information. We integrated genome-wide association study (GWAS) results with an admixture mapping scan of three pulmonary function tests (forced expiratory volume in 1 s [FEV$_1$], forced vital capacity [FVC], and FEV$_1$/FVC) taken before and after albuterol BD administration on the same subjects, yielding six traits. We identified 18 GWAS loci, and five additional loci from admixture mapping, spanning several known and novel lung function candidate genes. Most loci identified via admixture mapping exhibited wide variation in minor allele frequency across genotyped global populations. Functional fine-mapping revealed an enrichment of epigenetic annotations from peripheral blood mononuclear cells, fetal lung tissue, and lung fibroblasts. Our results point to three novel potential genetic drivers of pre- and post-BD lung function: ADAMTS1, RAD54B, and EGLN3.

},
  issn = {1098-2272},
  doi = {10.1002/gepi.22365},
  author = {Goddard, Pag{\'e} C and Keys, Kevin L and Mak, Angel C Y and Lee, Eunice Y and Liu, Amy K and Samedy-Bates, Lesly-Anne and Risse-Adams, Oona and Contreras, Mar{\'\i}a G and Elhawary, Jennifer R and Hu, Donglei and Huntsman, Scott and Oh, Sam S and Salazar, Sandra and Eng, Celeste and Himes, Blanca E and White, Marquitta J and Burchard, Esteban G}
}

% 140 -- Contreras et al. 2021, Ethn Dis: ancestry and air pollution interaction in bronchodilator response.
% The issue is dated by season ("Winter"), so month holds a literal rather than a month macro.
@article{140,
  title = {{Native American} Ancestry and Air Pollution Interact to Impact Bronchodilator Response in {Puerto Rican} Children with Asthma},
  journal = {Ethn Dis},
  volume = {31},
  year = {2021},
  month = {Winter},
  pages = {77--88},
  abstract = {

Objective: Asthma is the most common chronic disease in children. Short-acting bronchodilator medications are the most commonly prescribed asthma treatment worldwide, regardless of disease severity. Puerto Rican children display the highest asthma morbidity and mortality of any US population. Alarmingly, Puerto Rican children with asthma display poor bronchodilator drug response (BDR). Reduced BDR may explain, in part, the increased asthma morbidity and mortality observed in Puerto Rican children with asthma. Gene-environment interactions may explain a portion of the heritability of BDR. We aimed to identify gene-environment interactions associated with BDR in Puerto Rican children with asthma.

Setting: Genetic, environmental, and psycho-social data from the Genes-environments and Admixture in Latino Americans (GALA II) case-control study.

Participants: Our discovery dataset consisted of 658 Puerto Rican children with asthma; our replication dataset consisted of 514 Mexican American children with asthma.

Main Outcome Measures: We assessed the association of pairwise interaction models with BDR using ViSEN (Visualization of Statistical Epistasis Networks).

Results: We identified a non-linear interaction between Native American genetic ancestry and air pollution significantly associated with BDR in Puerto Rican children with asthma. This interaction was robust to adjustment for age and sex but was not significantly associated with BDR in our replication population.

Conclusions: Decreased Native American ancestry coupled with increased air pollution exposure was associated with increased BDR in Puerto Rican children with asthma. Our study acknowledges BDR{\textquoteright}s phenotypic complexity, and emphasizes the importance of integrating social, environmental, and biological data to further our understanding of complex disease.

},
  issn = {1945-0826},
  doi = {10.18865/ed.31.1.77},
  author = {Contreras, Mar{\'\i}a G and Keys, Kevin and Maga{\~n}a, Joaquin and Goddard, Pag{\'e} C and Risse-Adams, Oona and Zeiger, Andrew M and Mak, Angel C Y and Samedy-Bates, Lesly-Anne and Neophytou, Andreas M and Lee, Eunice and Thakur, Neeta and Elhawary, Jennifer R and Hu, Donglei and Huntsman, Scott and Eng, Celeste and Hu, Ting and Burchard, Esteban G and White, Marquitta J}
}

% 143 -- de Goede et al. 2021, Cell: lncRNA gene-trait associations from GTEx v8.
@article{143,
  title = {Population-scale tissue transcriptomics maps long non-coding {RNAs} to complex disease},
  journal = {Cell},
  volume = {184},
  year = {2021},
  month = may,
  day = {13},
  pages = {2633--2648.e19},
  abstract = {

Long non-coding RNA (lncRNA) genes have well-established and important impacts on molecular and cellular functions. However, among the thousands of lncRNA genes, it is still a major challenge to identify the subset with disease or trait relevance. To systematically characterize these lncRNA genes, we used Genotype Tissue Expression (GTEx) project v8 genetic and multi-tissue transcriptomic data to profile the expression, genetic regulation, cellular contexts, and trait associations of 14,100 lncRNA genes across 49 tissues for 101 distinct complex genetic traits. Using these approaches, we identified 1,432 lncRNA gene-trait associations, 800 of which were not explained by stronger effects of neighboring protein-coding genes. This included associations between lncRNA quantitative trait loci and inflammatory bowel disease, type 1 and type 2 diabetes, and coronary artery disease, as well as rare variant associations to body mass index.

},
  keywords = {Coronary Artery Disease, Diabetes Mellitus, Type 1, Diabetes Mellitus, Type 2, Disease, Gene Expression Profiling, Genetic Variation, Humans, Inflammatory Bowel Diseases, Multifactorial Inheritance, Organ Specificity, Population, Quantitative Trait Loci, RNA, Long Noncoding, Transcriptome},
  issn = {1097-4172},
  doi = {10.1016/j.cell.2021.03.050},
  author = {de Goede, Olivia M and Nachun, Daniel C and Ferraro, Nicole M and Gloudemans, Michael J and Rao, Abhiram S and Smail, Craig and Eulalio, Tiffany Y and Aguet, Francois and Ng, Bernard and Xu, Jishu and Barbeira, Alvaro N and Castel, Stephane E and Kim-Hellmuth, Sarah and Park, YoSon and Scott, Alexandra J and Strober, Benjamin J and Brown, Christopher D and Wen, Xiaoquan and Hall, Ira M and Battle, Alexis and Lappalainen, Tuuli and Im, Hae Kyung and Ardlie, Kristin G and Mostafavi, Sara and Quertermous, Thomas and Kirkegaard, Karla and Montgomery, Stephen B}
}

% 133 -- Naseri et al. 2021, PLoS Genet: RAFFI relationship inference for biobank-scale cohorts.
@article{133,
  title = {{RAFFI}: Accurate and fast familial relationship inference in large scale biobank studies using {RaPID}},
  journal = {PLoS Genet},
  volume = {17},
  year = {2021},
  month = jan,
  pages = {e1009315},
  abstract = {

Inference of relationships from whole-genome genetic data of a cohort is a crucial prerequisite for genome-wide association studies. Typically, relationships are inferred by computing the kinship coefficients ($\phi$) and the genome-wide probability of zero IBD sharing ($\pi_0$) among all pairs of individuals. Current leading methods are based on pairwise comparisons, which may not scale up to very large cohorts (e.g., sample size $>$1 million). Here, we propose an efficient relationship inference method, RAFFI. RAFFI leverages the efficient RaPID method to call IBD segments first, then estimate the $\phi$ and $\pi_0$ from detected IBD segments. This inference is achieved by a data-driven approach that adjusts the estimation based on phasing quality and genotyping quality. Using simulations, we showed that RAFFI is robust against phasing/genotyping errors, admix events, and varying marker densities, and achieves higher accuracy compared to KING, the current leading method, especially for more distant relatives. When applied to the phased UK Biobank data with $\sim$500K individuals, RAFFI is approximately 18 times faster than KING. We expect RAFFI will offer fast and accurate relatedness inference for even larger cohorts.

},
  keywords = {Biological Specimen Banks, Genome, Human, Genome-Wide Association Study, Genotyping Techniques, Haplotypes, Humans, Models, Genetic, Pedigree, Polymorphism, Single Nucleotide},
  issn = {1553-7404},
  doi = {10.1371/journal.pgen.1009315},
  author = {Naseri, Ardalan and Shi, Junjie and Lin, Xihong and Zhang, Shaojie and Zhi, Degui}
}

% 127 -- Wenric et al. 2021, Pharmacogenomics J: genetics of perioperative phenylephrine response.
@article{127,
  title = {Rapid response to the alpha-1 adrenergic agent phenylephrine in the perioperative period is impacted by genomics and ancestry},
  journal = {Pharmacogenomics J},
  volume = {21},
  year = {2021},
  month = apr,
  pages = {174--189},
  abstract = {

The emergence of genomic data in biobanks and health systems offers new ways to derive medically important phenotypes, including acute phenotypes occurring during inpatient clinical care. Here we study the genetic underpinnings of the rapid response to phenylephrine, an $\alpha_1$-adrenergic receptor agonist commonly used to treat hypotension during anesthesia and surgery. We quantified this response by extracting blood pressure (BP) measurements 5 min before and after the administration of phenylephrine. Based on this derived phenotype, we show that systematic differences exist between self-reported ancestry groups: European-Americans (EA; n = 1387) have a significantly higher systolic response to phenylephrine than African-Americans (AA; n = 1217) and Hispanic/Latinos (HA; n = 1713) (31.3\% increase, p value $<$ 6e-08 and 22.9\% increase, p value $<$ 5e-05 respectively), after adjusting for genetic ancestry, demographics, and relevant clinical covariates. We performed a genome-wide association study to investigate genetic factors underlying individual differences in this derived phenotype. We discovered genome-wide significant association signals in loci and genes previously associated with BP measured in ambulatory settings, and a general enrichment of association in these genes. Finally, we discovered two low frequency variants, present at $\sim$1\% in EAs and AAs, respectively, where patients carrying one copy of these variants show no phenylephrine response. This work demonstrates our ability to derive a quantitative phenotype suited for comparative statistics and genome-wide association studies from dense clinical and physiological measures captured for managing patients during surgery. We identify genetic variants underlying non response to phenylephrine, with implications for preemptive pharmacogenomic screening to improve safety during surgery.

},
  issn = {1473-1150},
  doi = {10.1038/s41397-020-00194-5},
  author = {Wenric, Stephane and Jeff, Janina M and Joseph, Thomas and Yee, Muh-Ching and Belbin, Gillian M and Owusu Obeng, Aniwaa and Ellis, Stephen B and Bottinger, Erwin P and Gottesman, Omri and Levin, Matthew A and Kenny, Eimear E}
}

% 137 -- Li et al. 2021, Bioinformatics: sparse-group regularized multi-response Cox regression.
% No volume or pages are recorded for this entry.
@article{137,
  title = {Survival Analysis on Rare Events Using Group-Regularized Multi-Response {Cox} Regression},
  journal = {Bioinformatics},
  year = {2021},
  month = feb,
  day = {9},
  abstract = {

MOTIVATION: The prediction performance of Cox proportional hazard model suffers when there are only few uncensored events in the training data.

RESULTS: We propose a Sparse-Group regularized Cox regression method to improve the prediction performance of large-scale and high-dimensional survival data with few observed events. Our approach is applicable when there is one or more other survival responses that 1. has a large number of observed events; 2. share a common set of associated predictors with the rare event response. This scenario is common in the UK Biobank (Sudlow et al., 2015) dataset where records for a large number of common and less prevalent diseases of the same set of individuals are available. By analyzing these responses together, we hope to achieve higher prediction performance than when they are analyzed individually. To make this approach practical for large-scale data, we developed an accelerated proximal gradient optimization algorithm as well as a screening procedure inspired by Qian et al. (2020).

AVAILABILITY: https://github.com/rivas-lab/multisnpnet-Cox.

SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.

},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btab095},
  author = {Li, Ruilin and Tanigawa, Yosuke and Justesen, Johanne M and Taylor, Jonathan and Hastie, Trevor and Tibshirani, Robert and Rivas, Manuel A}
}

% 118 -- Tsetsos et al. 2021, Transl Psychiatry: gene-set analysis of Tourette syndrome.
@article{118,
  title = {Synaptic processes and immune-related pathways implicated in {Tourette} syndrome},
  journal = {Transl Psychiatry},
  volume = {11},
  year = {2021},
  month = jan,
  day = {18},
  pages = {56},
  abstract = {

Tourette syndrome (TS) is a neuropsychiatric disorder of complex genetic architecture involving multiple interacting genes. Here, we sought to elucidate the pathways that underlie the neurobiology of the disorder through genome-wide analysis. We analyzed genome-wide genotypic data of 3581 individuals with TS and 7682 ancestry-matched controls and investigated associations of TS with sets of genes that are expressed in particular cell types and operate in specific neuronal and glial functions. We employed a self-contained, set-based association method (SBA) as well as a competitive gene set method (MAGMA) using individual-level genotype data to perform a comprehensive investigation of the biological background of TS. Our SBA analysis identified three significant gene sets after Bonferroni correction, implicating ligand-gated ion channel signaling, lymphocytic, and cell adhesion and transsynaptic signaling processes. MAGMA analysis further supported the involvement of the cell adhesion and trans-synaptic signaling gene set. The lymphocytic gene set was driven by variants in FLT3, raising an intriguing hypothesis for the involvement of a neuroinflammatory element in TS pathogenesis. The indications of involvement of ligand-gated ion channel signaling reinforce the role of GABA in TS, while the association of cell adhesion and trans-synaptic signaling gene set provides additional support for the role of adhesion molecules in neuropsychiatric disorders. This study reinforces previous findings but also provides new insights into the neurobiology of TS.

},
  issn = {2158-3188},
  doi = {10.1038/s41398-020-01082-z},
  author = {Tsetsos, Fotis and Yu, Dongmei and Sul, Jae Hoon and Huang, Alden Y and Illmann, Cornelia and Osiecki, Lisa and Darrow, Sabrina M and Hirschtritt, Matthew E and Greenberg, Erica and M{\"u}ller-Vahl, Kirsten R and Stuhrmann, Manfred and Dion, Yves and Rouleau, Guy A and Aschauer, Harald and Stamenkovic, Mara and Schl{\"o}gelhofer, Monika and Sandor, Paul and Barr, Cathy L and Grados, Marco A and Singer, Harvey S and N{\"o}then, Markus M and Hebebrand, Johannes and Hinney, Anke and King, Robert A and Fernandez, Thomas V and Barta, Csaba and Tarnok, Zsanett and Nagy, Peter and Depienne, Christel and Worbe, Yulia and Hartmann, Andreas and Budman, Cathy L and Rizzo, Renata and Lyon, Gholson J and McMahon, William M and Batterson, James R and Cath, Danielle C and Malaty, Irene A and Okun, Michael S and Berlin, Cheston and Woods, Douglas W and Lee, Paul C and Jankovic, Joseph and Robertson, Mary M and Gilbert, Donald L and Brown, Lawrence W and Coffey, Barbara J and Dietrich, Andrea and Hoekstra, Pieter J and Kuperman, Samuel and Zinner, Samuel H and Wagner, Michael and Knowles, James A and Willsey, A Jeremy and Tischfield, Jay A and Heiman, Gary A and Cox, Nancy J and Freimer, Nelson B and Neale, Benjamin M and Davis, Lea K and Coppola, Giovanni and Mathews, Carol A and Scharf, Jeremiah M and Paschou, Peristera and Kurlan, Roger and Leckman, James F and Smit, Jan H and Konstantinidis, Anastasios and Wolanczyk, Tomasz and Cheon, Keun-Ah and Garcia-Delgar, Blanca and Grice, Dorothy E and Hagstr{\o}m, Julie and Hedderly, Tammy and Heyman, Isobel and Huyser, Chaim and Kim, Young Key and Kim, Young-Shin and Koh, Yun-Joo and Kook, Sodahm and Leventhal, Bennett L and Madruga-Garrido, Marcos and Mir, Pablo and Morer, Astrid and M{\"u}nchau, Alexander and Plessen, Kerstin J and Roessner, Veit and Shin, Eun-Young and Song, Dong-Ho and Song, Jungeun}
}

% 118 (above): each contributor is listed once, in order of first appearance.
% NOTE(review): the names after "Paschou, Peristera" look like consortium members
% who are not on the main byline -- confirm against the publisher record.

@article {87, title = {Allelic Heterogeneity at the CRP Locus Identified by Whole-Genome Sequencing in Multi-ancestry Cohorts.}, journal = {Am J Hum Genet}, volume = {106}, year = {2020}, month = {2020 01 02}, pages = {112-120}, abstract = {

Whole-genome sequencing (WGS) can improve assessment of low-frequency and rare variants, particularly in non-European populations that have been underrepresented in existing genomic studies. The genetic determinants of C-reactive protein (CRP), a biomarker of chronic inflammation, have been extensively studied, with existing genome-wide association studies (GWASs) conducted in >200,000 individuals of European ancestry. In order to discover novel loci associated with CRP levels, we examined a multi-ancestry population (n = 23,279) with WGS (\~{}38{\texttimes} coverage) from the Trans-Omics for Precision Medicine (TOPMed) program. We found evidence for eight distinct associations at the CRP locus, including two variants that have not been identified previously (rs11265259 and rs181704186), both of which are non-coding and more common in individuals of African ancestry (\~{}10\% and \~{}1\% minor allele frequency, respectively, and rare or monomorphic in 1000 Genomes populations of East Asian, South Asian, and European ancestry). We show that the minor (G) allele of rs181704186 is associated with lower CRP levels and decreased transcriptional activity and protein binding in~vitro, providing a plausible molecular mechanism for this African ancestry-specific signal. The individuals homozygous for rs181704186-G have a mean CRP level of 0.23~mg/L, in contrast to individuals heterozygous for rs181704186 with mean CRP of 2.97~mg/L and major allele homozygotes with mean CRP of 4.11~mg/L. This study demonstrates the utility of WGS in multi-ethnic populations to drive discovery of complex trait associations of large effect and to identify functional alleles in noncoding regulatory regions.

}, keywords = {African Continental Ancestry Group, Asian Continental Ancestry Group, C-Reactive Protein, Cohort Studies, European Continental Ancestry Group, Gene Frequency, Genetic Predisposition to Disease, Genome-Wide Association Study, Humans, Linkage Disequilibrium, Polymorphism, Single Nucleotide, Whole Genome Sequencing}, issn = {1537-6605}, doi = {10.1016/j.ajhg.2019.12.002}, author = {Raffield, Laura M and Iyengar, Apoorva K and Wang, Biqi and Gaynor, Sheila M and Spracklen, Cassandra N and Zhong, Xue and Kowalski, Madeline H and Salimi, Shabnam and Polfus, Linda M and Benjamin, Emelia J and Bis, Joshua C and Bowler, Russell and Cade, Brian E and Choi, Won Jung and Comellas, Alejandro P and Correa, Adolfo and Cruz, Pedro and Doddapaneni, Harsha and Durda, Peter and Gogarten, Stephanie M and Jain, Deepti and Kim, Ryan W and Kral, Brian G and Lange, Leslie A and Larson, Martin G and Laurie, Cecelia and Lee, Jiwon and Lee, Seonwook and Lewis, Joshua P and Metcalf, Ginger A and Mitchell, Braxton D and Momin, Zeineen and Muzny, Donna M and Pankratz, Nathan and Park, Cheol Joo and Rich, Stephen S and Rotter, Jerome I and Ryan, Kathleen and Seo, Daekwan and Tracy, Russell P and Viaud-Martinez, Karine A and Yanek, Lisa R and Zhao, Lue Ping and Lin, Xihong and Li, Bingshan and Li, Yun and Dupuis, Jos{\'e}e and Reiner, Alexander P and Mohlke, Karen L and Auer, Paul L} } @article {108, title = {Analysis in case-control sequencing association studies with different sequencing depths.}, journal = {Biostatistics}, volume = {21}, year = {2020}, month = {2020 07 01}, pages = {577-593}, abstract = {

With the advent of next-generation sequencing, investigators have access to higher quality sequencing data. However, to sequence all samples in a study using next generation sequencing can still be prohibitively expensive. One potential remedy could be to combine next generation sequencing data from cases with publicly available sequencing data for controls, but there could be a systematic difference in quality of sequenced data, such as sequencing depths, between sequenced study cases and publicly available controls. We propose a regression calibration (RC)-based method and a maximum-likelihood method for conducting an association study with such a combined sample by accounting for differential sequencing errors between cases and controls. The methods allow for adjusting for covariates, such as population stratification as confounders. Both methods control type I error and have comparable power to analysis conducted using the true genotype with sufficiently high but different sequencing depths. We show that the RC method allows for analysis using naive variance estimate (closely approximates true variance in practice) and standard software under certain circumstances. We evaluate the performance of the proposed methods using simulation studies and apply our methods to a combined data set of exome sequenced acute lung injury cases and healthy controls from the 1000 Genomes project.

}, issn = {1468-4357}, doi = {10.1093/biostatistics/kxy073}, author = {Chen, Sixing and Lin, Xihong} } @article {91, title = {Assessing Digital Phenotyping to Enhance Genetic Studies of Human Diseases.}, journal = {Am J Hum Genet}, volume = {106}, year = {2020}, month = {2020 05 07}, pages = {611-622}, abstract = {

Population-scale biobanks that combine genetic data and high-dimensional phenotyping for a large number of participants provide an exciting opportunity to perform genome-wide association studies (GWAS) to identify genetic variants associated with diverse quantitative traits and diseases. A major challenge for GWAS in population biobanks is ascertaining disease cases from heterogeneous data sources such as hospital records, digital questionnaire responses, or interviews. In this study, we use genetic parameters, including genetic correlation, to evaluate whether GWAS performed using cases in the UK Biobank ascertained from hospital records, questionnaire responses, and family history of disease implicate similar disease genetics across a range of effect sizes. We find that hospital record and questionnaire GWAS largely identify similar genetic effects for many complex phenotypes and that combining together both phenotyping methods improves power to detect genetic associations. We also show that family history GWAS using cases ascertained on family history of disease agrees with combined hospital record and questionnaire GWAS and that family history GWAS has better power to detect genetic associations for some phenotypes. Overall, this work demonstrates that digital phenotyping and unstructured phenotype data can be combined with structured data such as hospital records to identify cases for GWAS in biobanks and improve the ability of such studies to identify genetic associations.

}, keywords = {Asthma, Databases, Factual, Disease, Female, Genetics, Medical, Genome-Wide Association Study, Genotype, Humans, Male, Neoplasms, Phenotype, United Kingdom}, issn = {1537-6605}, doi = {10.1016/j.ajhg.2020.03.007}, author = {DeBoever, Christopher and Tanigawa, Yosuke and Aguirre, Matthew and McInnes, Greg and Lavertu, Adam and Rivas, Manuel A} } @article {85, title = {A brief history of human disease genetics.}, journal = {Nature}, volume = {577}, year = {2020}, month = {2020 01}, pages = {179-189}, abstract = {

A primary goal of human genetics is to identify DNA sequence variants that influence biomedical traits, particularly those related to the onset and progression of human disease. Over the past 25 years, progress in realizing this objective has been transformed by advances in technology, foundational genomic resources and analytical tools, and by access to vast amounts of genotype and phenotype data. Genetic discoveries have substantially improved~our understanding of the mechanisms responsible for many rare and common diseases and driven development of novel preventative and therapeutic strategies. Medical innovation will increasingly focus on delivering care tailored to individual patterns of genetic predisposition.

}, keywords = {Animals, Genetic Testing, Genetic Variation, Genomics, Genotype, Humans, Phenotype, Rare Diseases}, issn = {1476-4687}, doi = {10.1038/s41586-019-1879-7}, author = {Claussnitzer, Melina and Cho, Judy H and Collins, Rory and Cox, Nancy J and Dermitzakis, Emmanouil T and Hurles, Matthew E and Kathiresan, Sekar and Kenny, Eimear E and Lindgren, Cecilia M and MacArthur, Daniel G and North, Kathryn N and Plon, Sharon E and Rehm, Heidi L and Risch, Neil and Rotimi, Charles N and Shendure, Jay and Soranzo, Nicole and McCarthy, Mark I} } @article {126, title = {A common variant in PNPLA3 is associated with age at diagnosis of NAFLD in patients from a multi-ethnic biobank.}, journal = {J Hepatol}, volume = {72}, year = {2020}, month = {2020 06}, pages = {1070-1081}, abstract = {

BACKGROUND \& AIMS: The Ile138Met variant (rs738409) in the PNPLA3 gene has the largest effect on non-alcoholic fatty liver disease (NAFLD), increasing the risk of progression to severe forms of liver disease. It remains unknown if the variant plays a role in age of NAFLD onset. We aimed to determine if rs738409 impacts on the age of NAFLD diagnosis.

METHODS: We applied a novel natural language processing (NLP) algorithm to a longitudinal electronic health records (EHR) dataset of >27,000 individuals with genetic data from a multi-ethnic biobank, defining NAFLD cases (n~= 1,703) and confirming controls (n~= 8,119). We conducted i) a survival analysis to determine if age at diagnosis differed by rs738409 genotype, ii) a receiver operating characteristics analysis to assess the utility of the rs738409 genotype in discriminating NAFLD cases from controls, and iii) a phenome-wide association study (PheWAS) between rs738409 and 10,095 EHR-derived disease diagnoses.

RESULTS: The PNPLA3 G risk allele was associated with: i) earlier age of NAFLD diagnosis, with the strongest effect in Hispanics (hazard ratio 1.33; 95\% CI 1.15-1.53; p <0.0001) among whom a NAFLD diagnosis was 15\% more likely in risk allele carriers vs. non-carriers; ii) increased NAFLD risk (odds ratio 1.61; 95\% CI 1.349-1.73; p <0.0001), with the strongest effect among Hispanics (odds ratio 1.43; 95\% CI 1.28-1.59; p <0.0001); iii) additional liver diseases in a PheWAS (p <4.95~{\texttimes} 10$^{-6}$) where the risk variant also associated with earlier age of diagnosis.

CONCLUSION: Given the role of the rs738409 in NAFLD diagnosis age, our results suggest that stratifying risk within populations known to have an enhanced risk of liver disease, such as Hispanic carriers of the rs738409 variant, would be effective in earlier identification of those who would benefit most from early NAFLD prevention and treatment strategies.

LAY SUMMARY: Despite clear associations between the PNPLA3 rs738409 variant and elevated risk of progression from non-alcoholic fatty liver disease (NAFLD) to more severe forms of liver disease, it remains unknown if PNPLA3 rs738409 plays a role in the age of NAFLD onset. Herein, we found that this risk variant is associated with an earlier age of NAFLD and other liver disease diagnoses; an observation most pronounced in Hispanic Americans. We conclude that PNPLA3 rs738409 could be used to better understand liver disease risk within vulnerable populations and identify patients that may benefit from early prevention strategies.

}, issn = {1600-0641}, doi = {10.1016/j.jhep.2020.01.029}, author = {Walker, Ryan W and Belbin, Gillian M and Sorokin, Elena P and Van Vleck, Tielman and Wojcik, Genevieve L and Moscati, Arden and Gignoux, Christopher R and Cho, Judy and Abul-Husn, Noura S and Nadkarni, Girish and Kenny, Eimear E and Loos, Ruth J F} } @article {100, title = {On the cross-population generalizability of gene expression prediction models.}, journal = {PLoS Genet}, volume = {16}, year = {2020}, month = {2020 08}, pages = {e1008927}, abstract = {

The genetic control of gene expression is a core component of human physiology. For the past several years, transcriptome-wide association studies have leveraged large datasets of linked genotype and RNA sequencing information to create a powerful gene-based test of association that has been used in dozens of studies. While numerous discoveries have been made, the populations in the training data are overwhelmingly of European descent, and little is known about the generalizability of these models to other populations. Here, we test for cross-population generalizability of gene expression prediction models using a dataset of African American individuals with RNA-Seq data in whole blood. We find that the default models trained in large datasets such as GTEx and DGN fare poorly in African Americans, with a notable reduction in prediction accuracy when compared to European Americans. We replicate these limitations in cross-population generalizability using the five populations in the GEUVADIS dataset. Via realistic simulations of both populations and gene expression, we show that accurate cross-population generalizability of transcriptome prediction only arises when eQTL architecture is substantially shared across populations. In contrast, models with non-identical eQTLs showed patterns similar to real-world data. Therefore, generating RNA-Seq data in diverse populations is a critical step towards multi-ethnic utility of gene expression prediction.

}, keywords = {African Americans, Gene Expression Profiling, Genome-Wide Association Study, Humans, Models, Genetic, Quantitative Trait Loci, Reference Standards, RNA-Seq, Transcriptome}, issn = {1553-7404}, doi = {10.1371/journal.pgen.1008927}, author = {Keys, Kevin L and Mak, Angel C Y and White, Marquitta J and Eckalbar, Walter L and Dahl, Andrew W and Mefford, Joel and Mikhaylova, Anna V and Contreras, Mar{\'\i}a G and Elhawary, Jennifer R and Eng, Celeste and Hu, Donglei and Huntsman, Scott and Oh, Sam S and Salazar, Sandra and LeNoir, Michael A and Ye, Jimmie C and Thornton, Timothy A and Zaitlen, Noah and Burchard, Esteban G and Gignoux, Christopher R} } @article {99, title = {Dynamic incorporation of multiple in silico functional annotations empowers rare variant association analysis of large whole-genome sequencing studies at scale.}, journal = {Nat Genet}, volume = {52}, year = {2020}, month = {2020 09}, pages = {969-983}, abstract = {

Large-scale whole-genome sequencing studies have enabled the analysis of rare variants (RVs) associated with complex phenotypes. Commonly used RV association tests have limited scope to leverage variant functions. We propose STAAR (variant-set test for association using annotation information), a scalable and powerful RV association test method that effectively incorporates both variant categories and multiple complementary annotations using a dynamic weighting scheme. For the latter, we introduce {\textquoteleft}annotation principal components{\textquoteright}, multidimensional summaries of in silico variant annotations. STAAR accounts for population structure and relatedness and is scalable for analyzing very large cohort and biobank whole-genome sequencing studies of continuous and dichotomous traits. We applied STAAR to identify RVs associated with four lipid traits in 12,316 discovery and 17,822 replication samples from the Trans-Omics for Precision Medicine Program. We discovered and replicated new RV associations, including disruptive missense RVs of NPC1L1 and an intergenic region near APOC1P1 associated with low-density lipoprotein cholesterol.

}, keywords = {Cholesterol, LDL, Computer Simulation, Genetic Predisposition to Disease, Genetic Variation, Genome, Genome-Wide Association Study, Humans, Models, Genetic, Molecular Sequence Annotation, Phenotype, Whole Genome Sequencing}, issn = {1546-1718}, doi = {10.1038/s41588-020-0676-4}, author = {Li, Xihao and Li, Zilin and Zhou, Hufeng and Gaynor, Sheila M and Liu, Yaowu and Chen, Han and Sun, Ryan and Dey, Rounak and Arnett, Donna K and Aslibekyan, Stella and Ballantyne, Christie M and Bielak, Lawrence F and Blangero, John and Boerwinkle, Eric and Bowden, Donald W and Broome, Jai G and Conomos, Matthew P and Correa, Adolfo and Cupples, L Adrienne and Curran, Joanne E and Freedman, Barry I and Guo, Xiuqing and Hindy, George and Irvin, Marguerite R and Kardia, Sharon L R and Kathiresan, Sekar and Khan, Alyna T and Kooperberg, Charles L and Laurie, Cathy C and Liu, X Shirley and Mahaney, Michael C and Manichaikul, Ani W and Martin, Lisa W and Mathias, Rasika A and McGarvey, Stephen T and Mitchell, Braxton D and Montasser, May E and Moore, Jill E and Morrison, Alanna C and O{\textquoteright}Connell, Jeffrey R and Palmer, Nicholette D and Pampana, Akhil and Peralta, Juan M and Peyser, Patricia A and Psaty, Bruce M and Redline, Susan and Rice, Kenneth M and Rich, Stephen S and Smith, Jennifer A and Tiwari, Hemant K and Tsai, Michael Y and Vasan, Ramachandran S and Wang, Fei Fei and Weeks, Daniel E and Weng, Zhiping and Wilson, James G and Yanek, Lisa R and Neale, Benjamin M and Sunyaev, Shamil R and Abecasis, Gon{\c c}alo R and Rotter, Jerome I and Willer, Cristen J and Peloso, Gina M and Natarajan, Pradeep and Lin, Xihong} } @article {123, title = {Efficient Estimation and Applications of Cross-Validated Genetic Predictions to Polygenic Risk Scores and Linear Mixed Models.}, journal = {J Comput Biol}, volume = {27}, year = {2020}, month = {2020 04}, pages = {599-612}, abstract = {

Large-scale cohorts with combined genetic and phenotypic data, coupled with methodological advances, have produced increasingly accurate genetic predictors of complex human phenotypes called polygenic risk scores (PRSs). In addition to the potential translational impacts of identifying at-risk individuals, PRS are being utilized for a growing list of scientific applications, including causal inference, identifying pleiotropy and genetic correlation, and powerful gene-based and mixed-model association tests. Existing PRS approaches rely on external large-scale genetic cohorts that have also measured the phenotype of interest. They further require matching on ancestry and genotyping platform or imputation quality. In this work, we present a novel reference-free method to produce a PRS that does not rely on an external cohort. We show that naive implementations of reference-free PRS either result in substantial overfitting or prohibitive increases in computational time. We show that our algorithm avoids both of these issues and can produce informative in-sample PRSs over a single cohort without overfitting. We then demonstrate several novel applications of reference-free PRSs, including detection of pleiotropy across 246 metabolic traits and efficient mixed-model association testing.

}, issn = {1557-8666}, doi = {10.1089/cmb.2019.0325}, author = {Mefford, Joel and Park, Danny and Zheng, Zhili and Ko, Arthur and Ala-Korpela, Mika and Laakso, Markku and Pajukanta, P{\"a}ivi and Yang, Jian and Witte, John and Zaitlen, Noah} } @article {82, title = {Electronic health record phenotypes associated with genetically regulated expression of CFTR and application to cystic fibrosis.}, journal = {Genet Med}, volume = {22}, year = {2020}, month = {2020 07}, pages = {1191-1200}, abstract = {

PURPOSE: The increasing use of electronic health records (EHRs) and biobanks offers unique opportunities to study Mendelian diseases. We described a novel approach to summarize clinical manifestations from patient EHRs into phenotypic evidence for cystic fibrosis (CF) with potential to alert unrecognized patients of the disease.

METHODS: We estimated genetically predicted expression (GReX) of cystic fibrosis transmembrane conductance regulator (CFTR) and tested for association with clinical diagnoses in the Vanderbilt University biobank (N = 9142 persons of European descent with 71 cases of CF). The top associated EHR phenotypes were assessed in combination as a phenotype risk score (PheRS) for discriminating CF case status in an additional 2.8 million patients from Vanderbilt University Medical Center (VUMC) and 125,305 adult patients including 25,314 CF cases from MarketScan, an independent external cohort.

RESULTS: GReX of CFTR was associated with EHR phenotypes consistent with CF. PheRS constructed using the EHR phenotypes and weights discovered by the genetic associations improved discriminative power for CF over the initially proposed PheRS in both VUMC and MarketScan.

CONCLUSION: Our study demonstrates the power of EHRs for clinical description of CF and the benefits of using a genetics-informed weighting scheme in construction of a phenotype risk score. This research may find broad applications for phenomic studies of Mendelian disease genes.

}, keywords = {Adult, Cystic Fibrosis, Cystic Fibrosis Transmembrane Conductance Regulator, Electronic Health Records, Humans, Mutation, Phenotype}, issn = {1530-0366}, doi = {10.1038/s41436-020-0786-5}, author = {Zhong, Xue and Yin, Zhijun and Jia, Gengjie and Zhou, Dan and Wei, Qiang and Faucon, Annika and Evans, Patrick and Gamazon, Eric R and Li, Bingshan and Tao, Ran and Rzhetsky, Andrey and Bastarache, Lisa and Cox, Nancy J} } @article {116, title = {Epidemiology of Functional Seizures Among Adults Treated at a University Hospital.}, journal = {JAMA Netw Open}, volume = {3}, year = {2020}, month = {2020 12 01}, pages = {e2027920}, abstract = {

Importance: Functional seizures (formerly psychogenic nonepileptic seizures), paroxysmal episodes that are often similar to epileptic seizures in their clinical presentation and display no aberrant brain electrical patterns, are understudied. Patients experience a long diagnostic delay, few treatment modalities, a high rate of comorbidities, and significant stigma due to the lack of knowledge about functional seizures.

Objective: To characterize the clinical epidemiology of a population of patients with functional seizures observed at Vanderbilt University Medical Center (VUMC).

Design, Setting, and Participants: This case-control study included patients with functional seizures identified in the VUMC electronic health record (VUMC-EHR) system from October 1989 to October 2018. Patients with epilepsy were excluded from the study and all remaining patients in the VUMC medical center system were used as controls. In total, the study included 1431 patients diagnosed with functional seizures, 2251 with epilepsy and functional seizures, 4715 with epilepsy without functional seizures, and 502 200 control patients who received treatment at VUMC for a minimum of 3 years. Data were analyzed from November 2018 to March 2020.

Exposure: Diagnosis of functional seizures, as identified from the VUMC-EHR system by an automated phenotyping algorithm that incorporated International Classification of Diseases, Ninth Revision (ICD-9) codes, International Statistical Classification of Diseases and Related Health Problems, Tenth Revision (ICD-10) codes, Current Procedural Terminology codes, and natural language processing.

Main Outcomes and Measures: Associations of functional seizures with comorbidities and risk factors, measured in odds ratios (ORs).

Results: Of 2 346 808 total patients in the VUMC-EHR aged 18 years or older, 3341 patients with functional seizures were identified (period prevalence, 0.14\%), 1062 (74.2\%) of whom were women and for which the median (interquartile range) age was 49.3 (39.4-59.9) years. This assessment replicated previously reported associations with psychiatric disorders including posttraumatic stress disorder (PTSD) (OR, 1.22; 95\% CI, 1.21-1.24; P < 3.02 {\texttimes} 10$^{-5}$), anxiety (OR, 1.14; 95\% CI, 1.13-1.15; P < 3.02 {\texttimes} 10$^{-5}$), and depression (OR, 1.14; 95\% CI, 1.13-1.15; P < 3.02 {\texttimes} 10$^{-5}$), and identified novel associations with cerebrovascular disease (OR, 1.08; 95\% CI, 1.06-1.09; P < 3.02 {\texttimes} 10$^{-5}$). An association was found between functional seizures and the known risk factor sexual assault trauma (OR, 10.26; 95\% CI, 10.09-10.44; P < 3.02 {\texttimes} 10$^{-5}$), and sexual assault trauma was found to mediate nearly a quarter of the association between female sex and functional seizures in the VUMC-EHR.

Conclusions and Relevance: This case-control study found evidence to support previously reported associations, discovered new associations between functional seizures and PTSD, anxiety, and depression. An association between cerebrovascular disease and functional seizures was also found. Results suggested that sexual trauma may be a mediating factor in the association between female sex and functional seizures.

}, keywords = {Adult, Anxiety, Case-Control Studies, Comorbidity, Delayed Diagnosis, Depression, Female, Hospitals, University, Humans, Male, Mental Disorders, Middle Aged, Odds Ratio, Risk Factors, Seizures, Stress Disorders, Post-Traumatic}, issn = {2574-3805}, doi = {10.1001/jamanetworkopen.2020.27920}, author = {Goleva, Slavina B and Lake, Allison M and Torstenson, Eric S and Haas, Kevin F and Davis, Lea K} } @article {120, title = {Evidence for secondary-variant genetic burden and non-random distribution across biological modules in a recessive ciliopathy.}, journal = {Nat Genet}, volume = {52}, year = {2020}, month = {2020 11}, pages = {1145-1150}, abstract = {

The influence of genetic background on driver mutations is well established; however, the mechanisms by which the background interacts with Mendelian loci remain unclear. We performed a systematic secondary-variant burden analysis of two independent cohorts of patients with Bardet-Biedl syndrome (BBS) with known recessive biallelic pathogenic mutations in one of 17 BBS genes for each individual. We observed a significant enrichment of trans-acting rare nonsynonymous secondary variants in patients with BBS compared with either population controls or a cohort of individuals with a non-BBS diagnosis and recessive variants in the same gene set. Strikingly, we found a significant over-representation of secondary alleles in chaperonin-encoding genes---a finding corroborated by the observation of epistatic interactions involving this complex in vivo. These data indicate a complex genetic architecture for BBS that informs the biological properties of disease modules and presents a model for secondary-variant burden analysis in recessive disorders.

}, keywords = {Alleles, Bardet-Biedl Syndrome, Cohort Studies, Exome, Genetic Variation, Humans}, issn = {1546-1718}, doi = {10.1038/s41588-020-0707-1}, author = {Kousi, Maria and S{\"o}ylemez, Onuralp and Ozanturk, Ayseg{\"u}l and Mourtzi, Niki and Akle, Sebastian and Jungreis, Irwin and Muller, Jean and Cassa, Christopher A and Brand, Harrison and Mokry, Jill Anne and Wolf, Maxim Y and Sadeghpour, Azita and McFadden, Kelsey and Lewis, Richard A and Talkowski, Michael E and Dollfus, H{\'e}l{\`e}ne and Kellis, Manolis and Davis, Erica E and Sunyaev, Shamil R and Katsanis, Nicholas} } @article {119, title = {Fast Lasso method for large-scale and ultrahigh-dimensional Cox model with applications to UK Biobank.}, journal = {Biostatistics}, year = {2020}, month = {2020 Sep 29}, abstract = {

We develop a scalable and highly efficient algorithm to fit a Cox proportional hazard model by maximizing the $L^1$-regularized (Lasso) partial likelihood function, based on the Batch Screening Iterative Lasso (BASIL) method developed in Qian and others (2019). Our algorithm is particularly suitable for large-scale and high-dimensional data that do not fit in the memory. The output of our algorithm is the full Lasso path, the parameter estimates at all predefined regularization parameters, as well as their validation accuracy measured using the concordance index (C-index) or the validation deviance. To demonstrate the effectiveness of our algorithm, we analyze a large genotype-survival time dataset across 306 disease outcomes from the UK Biobank (Sudlow and others, 2015). We provide a publicly available implementation of the proposed approach for genetics data on top of the PLINK2 package and name it snpnet-Cox.

}, issn = {1468-4357}, doi = {10.1093/biostatistics/kxaa038}, author = {Li, Ruilin and Chang, Christopher and Justesen, Johanne M and Tanigawa, Yosuke and Qiang, Junyang and Hastie, Trevor and Rivas, Manuel A and Tibshirani, Robert} } @article {81, title = {Identification of cancer driver genes based on nucleotide context.}, journal = {Nat Genet}, volume = {52}, year = {2020}, month = {2020 02}, pages = {208-218}, abstract = {

Cancer genomes contain large numbers of somatic mutations but few of these mutations drive tumor development. Current approaches either identify driver genes on the basis of mutational recurrence or approximate the functional consequences of nonsynonymous mutations by using bioinformatic scores. Passenger mutations are enriched in characteristic nucleotide contexts, whereas driver mutations occur in functional positions, which are not necessarily surrounded by a particular nucleotide context. We observed that mutations in contexts that deviate from the characteristic contexts around passenger mutations provide a signal in favor of driver genes. We therefore developed a method that combines this feature with the signals traditionally used for driver-gene identification. We applied our method to whole-exome sequencing data from 11,873 tumor-normal pairs and identified 460 driver genes that clustered into 21 cancer-related pathways. Our study provides a resource of driver genes across 28 tumor types with additional driver genes identified according to mutations in unusual nucleotide contexts.

}, keywords = {Cluster Analysis, Computational Biology, Humans, Mutation, Neoplasms, Nucleotides, Proteins, Whole Exome Sequencing}, issn = {1546-1718}, doi = {10.1038/s41588-019-0572-y}, author = {Dietlein, Felix and Weghorn, Donate and Taylor-Weiner, Amaro and Richters, Andr{\'e} and Reardon, Brendan and Liu, David and Lander, Eric S and Van Allen, Eliezer M and Sunyaev, Shamil R} } @article {74, title = {Identifying causal variants and genes using functional genomics in specialized cell types and contexts.}, journal = {Hum Genet}, volume = {139}, year = {2020}, month = {2020 Jan}, pages = {95-102}, abstract = {

A central goal in human genetics is the identification of variants and genes that influence the risk of polygenic diseases. In the past decade, genome-wide association studies (GWAS) have identified tens of thousands of genetic loci associated with various diseases. Since the majority of such loci lie within non-coding regions and have many candidate variants in linkage disequilibrium, it has been challenging to accurately identify specific causal variants and genes. To aid in their discovery a variety of statistical and experimental approaches have been developed. These approaches often borrow information from functional genomics assays such as ATAC-seq, ChIP-seq and RNA-seq to annotate functional variants and identify regulatory relationships between variants and genes. While such approaches are powerful, given the diversity of cell types and environments, it is paramount to select disease-relevant contexts for follow-up analyses. In this review, we discuss the latest developments, challenges, and best practices for determining the causal mechanisms of polygenic disease risk variants with functional genomics data from specialized cell types.

}, keywords = {Cell Lineage, Genes, Genetic Predisposition to Disease, Genome, Human, Genome-Wide Association Study, Genomics, Humans, Polymorphism, Single Nucleotide, Quantitative Trait Loci}, issn = {1432-1203}, doi = {10.1007/s00439-019-02044-2}, author = {Liu, Boxiang and Montgomery, Stephen B} } @article {95, title = {Impact of admixture and ancestry on eQTL analysis and GWAS colocalization in GTEx.}, journal = {Genome Biol}, volume = {21}, year = {2020}, month = {2020 09 11}, pages = {233}, abstract = {

BACKGROUND: Population structure among study subjects may confound genetic association studies, and lack of proper correction can lead to spurious findings. The Genotype-Tissue Expression (GTEx) project largely contains individuals of European ancestry, but the v8 release also includes up to 15\% of individuals of non-European ancestry. Assessing ancestry-based adjustments in GTEx improves portability of this research across populations and further characterizes the impact of population structure on GWAS colocalization.

RESULTS: Here, we identify a subset of 117 individuals in GTEx (v8) with a high degree of population admixture and estimate genome-wide local ancestry. We perform genome-wide cis-eQTL mapping using admixed samples in seven tissues, adjusted by either global or local ancestry. Consistent with previous work, we observe improved power with local ancestry adjustment. At loci where the two adjustments produce different lead variants, we observe 31 loci (0.02\%) where a significant colocalization is called only with one eQTL ancestry adjustment method. Notably, both adjustments produce similar numbers of significant colocalizations within each of two different colocalization methods, COLOC and FINEMAP. Finally, we identify a small subset of eQTL-associated variants highly correlated with local ancestry, providing a resource to enhance functional follow-up.

CONCLUSIONS: We provide a local ancestry map for admixed individuals in the GTEx v8 release and describe the impact of ancestry and admixture on gene expression, eQTLs, and GWAS colocalization. While the majority of the results are concordant between local and global ancestry-based adjustments, we identify distinct advantages and disadvantages to each approach.

}, issn = {1474-760X}, doi = {10.1186/s13059-020-02113-0}, author = {Gay, Nicole R and Gloudemans, Michael and Antonio, Margaret L and Abell, Nathan S and Balliu, Brunilda and Park, YoSon and Martin, Alicia R and Musharoff, Shaila and Rao, Abhiram S and Aguet, Francois and Barbeira, Alvaro N and Bonazzola, Rodrigo and Hormozdiari, Farhad and Ardlie, Kristin G and Brown, Christopher D and Im, Hae Kyung and Lappalainen, Tuuli and Wen, Xiaoquan and Montgomery, Stephen B} } @article {105, title = {Lung Function in African American Children with Asthma Is Associated with Novel Regulatory Variants of the KIT Ligand and Gene-By-Air-Pollution Interaction.}, journal = {Genetics}, volume = {215}, year = {2020}, month = {2020 07}, pages = {869-886}, abstract = {

Baseline lung function, quantified as forced expiratory volume in the first second of exhalation (FEV$_1$), is a standard diagnostic criterion used by clinicians to identify and classify lung diseases. Using whole-genome sequencing data from the National Heart, Lung, and Blood Institute Trans-Omics for Precision Medicine project, we identified a novel genetic association with FEV$_1$ on chromosome 12 in 867 African American children with asthma ($P$ = 1.26 {\texttimes} 10$^{-8}$, β = 0.302). Conditional analysis within 1 Mb of the tag signal (rs73429450) yielded one major and two other weaker independent signals within this peak. We explored statistical and functional evidence for all variants in linkage disequilibrium with the three independent signals and yielded nine variants as the most likely candidates responsible for the association with FEV$_1$. Hi-C data and expression QTL analysis demonstrated that these variants physically interacted with KITLG (KIT ligand, also known as SCF), and their minor alleles were associated with increased expression of the KITLG gene in nasal epithelial cells. Gene-by-air-pollution interaction analysis found that the candidate variant rs58475486 interacted with past-year ambient sulfur dioxide exposure ($P$ = 0.003, β = 0.32). This study identified a novel protective genetic association with FEV$_1$, possibly mediated through KITLG, in African American children with asthma. This is the first study that has identified a genetic association between lung function and KITLG, which has established a role in orchestrating allergic inflammation in asthma.

}, issn = {1943-2631}, doi = {10.1534/genetics.120.303231}, author = {Mak, Angel C Y and Sajuthi, Satria and Joo, Jaehyun and Xiao, Shujie and Sleiman, Patrick M and White, Marquitta J and Lee, Eunice Y and Saef, Benjamin and Hu, Donglei and Gui, Hongsheng and Keys, Kevin L and Lurmann, Fred and Jain, Deepti and Abecasis, Goncalo and Kang, Hyun Min and Nickerson, Deborah A and Germer, Soren and Zody, Michael C and Winterkorn, Lara and Reeves, Catherine and Huntsman, Scott and Eng, Celeste and Salazar, Sandra and Oh, Sam S and Gilliland, Frank D and Chen, Zhanghua and Kumar, Rajesh and Mart{\'\i}nez, Fernando D and Wu, Ann Chen and Ziv, Elad and Hakonarson, Hakon and Himes, Blanca E and Williams, L Keoki and Seibold, Max A and Burchard, Esteban G} } @article {124, title = {Modeling epistasis in mice and yeast using the proportion of two or more distinct genetic backgrounds: Evidence for "polygenic epistasis".}, journal = {PLoS Genet}, volume = {16}, year = {2020}, month = {2020 10}, pages = {e1009165}, abstract = {

BACKGROUND: The majority of quantitative genetic models used to map complex traits assume that alleles have similar effects across all individuals. Significant evidence suggests, however, that epistatic interactions modulate the impact of many alleles. Nevertheless, identifying epistatic interactions remains computationally and statistically challenging. In this work, we address some of these challenges by developing a statistical test for polygenic epistasis that determines whether the effect of an allele is altered by the global genetic ancestry proportion from distinct progenitors.

RESULTS: We applied our method to data from mice and yeast. For the mice, we observed 49 significant genotype-by-ancestry interaction associations across 14 phenotypes as well as over 1,400 Bonferroni-corrected genotype-by-ancestry interaction associations for mouse gene expression data. For the yeast, we observed 92 significant genotype-by-ancestry interactions across 38 phenotypes. Given this evidence of epistasis, we test for and observe evidence of rapid selection pressure on ancestry specific polymorphisms within one of the cohorts, consistent with epistatic selection.

CONCLUSIONS: Unlike our prior work in human populations, we observe widespread evidence of ancestry-modified SNP effects, perhaps reflecting the greater divergence present in crosses using mice and yeast.

}, keywords = {Alleles, Animals, Epistasis, Genetic, Evolution, Molecular, Genotype, Humans, Mice, Models, Genetic, Multifactorial Inheritance, Phenotype, Quantitative Trait Loci, Saccharomyces cerevisiae, Selection, Genetic}, issn = {1553-7404}, doi = {10.1371/journal.pgen.1009165}, author = {Rau, Christoph D and Gonzales, Natalia M and Bloom, Joshua S and Park, Danny and Ayroles, Julien and Palmer, Abraham A and Lusis, Aldons J and Zaitlen, Noah} } @article {103, title = {Multi-Ethnic Genome-Wide Association Study of Decomposed Cardioelectric Phenotypes Illustrates Strategies to Identify and Characterize Evidence of Shared Genetic Effects for Complex Traits.}, journal = {Circ Genom Precis Med}, volume = {13}, year = {2020}, month = {2020 08}, pages = {e002680}, abstract = {

BACKGROUND: We examined how expanding electrocardiographic trait genome-wide association studies to include ancestrally diverse populations, prioritize more precise phenotypic measures, and evaluate evidence for shared genetic effects enabled the detection and characterization of loci.

METHODS: We decomposed 10 seconds, 12-lead electrocardiograms from 34 668 multi-ethnic participants (15\% Black; 30\% Hispanic/Latino) into 6 contiguous, physiologically distinct (P wave, PR segment, QRS interval, ST segment, T wave, and TP segment) and 2 composite, conventional (PR interval and QT interval) interval scale traits and conducted multivariable-adjusted, trait-specific univariate genome-wide association studies using 1000-G imputed single-nucleotide polymorphisms. Evidence of shared genetic effects was evaluated by aggregating meta-analyzed univariate results across the 6 continuous electrocardiographic traits using the combined phenotype adaptive sum of powered scores test.

RESULTS: We identified 6 novels (, and ) and 87 known loci (adaptive sum of powered score test <5{\texttimes}10). Lead single-nucleotide polymorphism rs3211938 at was common in Blacks (minor allele frequency=10\%), near monomorphic in European Americans, and had effects on the QT interval and TP segment that ranked among the largest reported to date for common variants. The other 5 novel loci were observed when evaluating the contiguous but not the composite electrocardiographic traits. Combined phenotype testing did not identify novel electrocardiographic loci unapparent using traditional univariate approaches, although this approach did assist with the characterization of known loci.

CONCLUSIONS: Despite including one-third as many participants as published electrocardiographic trait genome-wide association studies, our study identified 6 novel loci, emphasizing the importance of ancestral diversity and phenotype resolution in this era of ever-growing genome-wide association studies.

}, issn = {2574-8300}, doi = {10.1161/CIRCGEN.119.002680}, author = {Baldassari, Antoine R and Sitlani, Colleen M and Highland, Heather M and Arking, Dan E and Buyske, Steve and Darbar, Dawood and Gondalia, Rahul and Graff, Misa and Guo, Xiuqing and Heckbert, Susan R and Hindorff, Lucia A and Hodonsky, Chani J and Ida Chen, Yii-Der and Kaplan, Robert C and Peters, Ulrike and Post, Wendy and Reiner, Alex P and Rotter, Jerome I and Shohet, Ralph V and Seyerle, Amanda A and Sotoodehnia, Nona and Tao, Ran and Taylor, Kent D and Wojcik, Genevieve L and Yao, Jie and Kenny, Eimear E and Lin, Henry J and Soliman, Elsayed Z and Whitsel, Eric A and North, Kari E and Kooperberg, Charles and Avery, Christy L} } @article {125, title = {Operating characteristics of the rank-based inverse normal transformation for quantitative trait analysis in genome-wide association studies.}, journal = {Biometrics}, volume = {76}, year = {2020}, month = {2020 12}, pages = {1262-1272}, abstract = {

Quantitative traits analyzed in Genome-Wide Association Studies (GWAS) are often nonnormally distributed. For such traits, association tests based on standard linear regression are subject to reduced power and inflated type I error in finite samples. Applying the rank-based inverse normal transformation (INT) to nonnormally distributed traits has become common practice in GWAS. However, the different variations on INT-based association testing have not been formally defined, and guidance is lacking on when to use which approach. In this paper, we formally define and systematically compare the direct (D-INT) and indirect (I-INT) INT-based association tests. We discuss their assumptions, underlying generative models, and connections. We demonstrate that the relative powers of D-INT and I-INT depend on the underlying data generating process. Since neither approach is uniformly most powerful, we combine them into an adaptive omnibus test (O-INT). O-INT is robust to model misspecification, protects the type I error, and is well powered against a wide range of nonnormally distributed traits. Extensive simulations were conducted to examine the finite sample operating characteristics of these tests. Our results demonstrate that, for nonnormally distributed traits, INT-based tests outperform the standard untransformed association test, both in terms of power and type I error rate control. We apply the proposed methods to GWAS of spirometry traits in the UK Biobank. O-INT has been implemented in the R package RNOmni, which is available on~CRAN.

}, issn = {1541-0420}, doi = {10.1111/biom.13214}, author = {McCaw, Zachary R and Lane, Jacqueline M and Saxena, Richa and Redline, Susan and Lin, Xihong} } @article {76, title = {Phenome-based approach identifies RIC1-linked Mendelian syndrome through zebrafish models, biobank associations and clinical studies.}, journal = {Nat Med}, volume = {26}, year = {2020}, month = {2020 01}, pages = {98-109}, abstract = {

Discovery of genotype-phenotype relationships remains a major challenge in clinical medicine. Here, we combined three sources of phenotypic data to uncover a new mechanism for rare and common diseases resulting from collagen secretion deficits. Using a zebrafish genetic screen, we identified the ric1 gene as being essential for skeletal biology. Using a gene-based phenome-wide association study (PheWAS) in the EHR-linked BioVU biobank, we show that reduced genetically determined expression of RIC1 is associated with musculoskeletal and dental conditions. Whole-exome sequencing identified individuals homozygous-by-descent for a rare variant in RIC1 and, through a guided clinical re-evaluation, it was discovered that they share signs with the BioVU-associated phenome. We named this new Mendelian syndrome CATIFA (cleft lip, cataract, tooth abnormality, intellectual disability, facial dysmorphism, attention-deficit hyperactivity disorder) and revealed further disease mechanisms. This gene-based, PheWAS-guided approach can accelerate the discovery of clinically relevant disease phenome and associated biological mechanisms.

}, keywords = {Abnormalities, Multiple, Animals, Behavior, Animal, Biological Specimen Banks, Chondrocytes, Disease Models, Animal, Extracellular Matrix, Fibroblasts, Guanine Nucleotide Exchange Factors, Humans, Models, Biological, Musculoskeletal System, Osteogenesis, Phenomics, Phenotype, Procollagen, Protein Transport, Secretory Pathway, Syndrome, Zebrafish, Zebrafish Proteins}, issn = {1546-170X}, doi = {10.1038/s41591-019-0705-y}, author = {Unlu, Gokhan and Qi, Xinzi and Gamazon, Eric R and Melville, David B and Patel, Nisha and Rushing, Amy R and Hashem, Mais and Al-Faifi, Abdullah and Chen, Rui and Li, Bingshan and Cox, Nancy J and Alkuraya, Fowzan S and Knapik, Ela W} } @article {89, title = {A positively selected FBN1 missense variant reduces height in Peruvian individuals.}, journal = {Nature}, volume = {582}, year = {2020}, month = {2020 06}, pages = {234-239}, abstract = {

On average, Peruvian individuals are among the shortest in the world. Here we show that Native American ancestry is associated with reduced height in an ethnically diverse group of Peruvian individuals, and identify a population-specific, missense variant in the FBN1 gene (E1297G) that is significantly associated with lower height. Each copy of the minor allele (frequency of 4.7\%) reduces height by 2.2~cm (4.4~cm in homozygous individuals). To our knowledge, this is the largest effect size known for a common height-associated variant. FBN1 encodes the extracellular matrix protein fibrillin~1, which is a major structural component of microfibrils. We observed less densely packed fibrillin-1-rich microfibrils with irregular edges in the skin of individuals who were homozygous for G1297 compared with individuals who were homozygous for E1297. Moreover, we show that the E1297G locus is under positive selection in non-African populations, and that the E1297 variant shows subtle evidence of positive selection specifically within the Peruvian population. This variant is also significantly more frequent in coastal Peruvian populations than in populations from the Andes or the Amazon, which suggests that short stature might be the result of adaptation to factors that are associated with the coastal environment in Peru.

}, keywords = {Body Height, Female, Fibrillin-1, Gene Frequency, Genome-Wide Association Study, Heredity, Humans, Indians, South American, Male, Microfibrils, Mutation, Missense, Peru, Selection, Genetic}, issn = {1476-4687}, doi = {10.1038/s41586-020-2302-0}, author = {Asgari, Samira and Luo, Yang and Akbari, Ali and Belbin, Gillian M and Li, Xinyi and Harris, Daniel N and Selig, Martin and Bartell, Eric and Calderon, Roger and Slowikowski, Kamil and Contreras, Carmen and Yataco, Rosa and Galea, Jerome T and Jimenez, Judith and Coit, Julia M and Farro{\~n}ay, Chandel and Nazarian, Rosalynn M and O{\textquoteright}Connor, Timothy D and Dietz, Harry C and Hirschhorn, Joel N and Guio, Heinner and Lecca, Leonid and Kenny, Eimear E and Freeman, Esther E and Murray, Megan B and Raychaudhuri, Soumya} } @article {97, title = {PSCAN: Spatial scan tests guided by protein structures improve complex disease gene discovery and signal variant detection.}, journal = {Genome Biol}, volume = {21}, year = {2020}, month = {2020 08 26}, pages = {217}, abstract = {

Germline disease-causing variants are generally more spatially clustered in protein 3-dimensional structures than benign variants. Motivated by this tendency, we develop a fast and powerful protein-structure-based scan (PSCAN) approach for evaluating gene-level associations with complex disease and detecting signal variants. We validate PSCAN{\textquoteright}s performance on synthetic data and two real data sets for lipid traits and Alzheimer{\textquoteright}s disease. Our results demonstrate that PSCAN performs competitively with existing gene-level tests while increasing power and identifying more specific signal variant sets. Furthermore, PSCAN enables generation of hypotheses about the molecular basis for the associations in the context of protein structures and functional domains.

}, issn = {1474-760X}, doi = {10.1186/s13059-020-02121-0}, author = {Tang, Zheng-Zheng and Sliwoski, Gregory R and Chen, Guanhua and Jin, Bowen and Bush, William S and Li, Bingshan and Capra, John A} } @article {90, title = {Rare protein-altering variants in ANGPTL7 lower intraocular pressure and protect against glaucoma.}, journal = {PLoS Genet}, volume = {16}, year = {2020}, month = {2020 05}, pages = {e1008682}, abstract = {

Protein-altering variants that are protective against human disease provide in vivo validation of therapeutic targets. Here we use genotyping data from UK Biobank (n = 337,151 unrelated White British individuals) and FinnGen (n = 176,899) to conduct a search for protein-altering variants conferring lower intraocular pressure (IOP) and protection against glaucoma. Through rare protein-altering variant association analysis, we find a missense variant in ANGPTL7 in UK Biobank (rs28991009, p.Gln175His, MAF = 0.8\%, genotyped in 82,253 individuals with measured IOP and an independent set of 4,238 glaucoma patients and 250,660 controls) that significantly lowers IOP (β = $-$0.53 and $-$0.67 mmHg for heterozygotes, $-$3.40 and $-$2.37 mmHg for homozygotes, P = 5.96 {\texttimes} 10$^{-9}$ and 1.07 {\texttimes} 10$^{-13}$ for corneal compensated and Goldman-correlated IOP, respectively) and is associated with 34\% reduced risk of glaucoma (P = 0.0062). In FinnGen, we identify an ANGPTL7 missense variant at a greater than 50-fold increased frequency in Finland compared with other populations (rs147660927, p.Arg220Cys, MAF Finland = 4.3\%), which was genotyped in 6,537 glaucoma patients and 170,362 controls and is associated with a 29\% lower glaucoma risk (P = 1.9 {\texttimes} 10$^{-12}$ for all glaucoma types and also protection against its subtypes including exfoliation, primary open-angle, and primary angle-closure). We further find three rarer variants in UK Biobank, including a protein-truncating variant, which confer a strong composite lowering of IOP (P = 0.0012 and 0.24 for Goldman-correlated and corneal compensated IOP, respectively), suggesting the protective mechanism likely resides in the loss of interaction or function. Our results support inhibition or down-regulation of ANGPTL7 as a therapeutic strategy for glaucoma.

}, keywords = {Adult, Aged, Aged, 80 and over, Angiopoietin-like Proteins, Biological Specimen Banks, Case-Control Studies, Cohort Studies, Female, Finland, Gene Frequency, Genetic Predisposition to Disease, Genetics, Population, Genome-Wide Association Study, Glaucoma, Humans, Intraocular Pressure, Loss of Function Mutation, Male, Middle Aged, Mutation, Missense, Polymorphism, Single Nucleotide, United Kingdom}, issn = {1553-7404}, doi = {10.1371/journal.pgen.1008682}, author = {Tanigawa, Yosuke and Wainberg, Michael and Karjalainen, Juha and Kiiskinen, Tuomo and Venkataraman, Guhan and Lemmel{\"a}, Susanna and Turunen, Joni A and Graham, Robert R and Havulinna, Aki S and Perola, Markus and Palotie, Aarno and Daly, Mark J and Rivas, Manuel A} } @article {110, title = {A Transcriptome-Wide Association Study Identifies Candidate Susceptibility Genes for Pancreatic Cancer Risk.}, journal = {Cancer Res}, volume = {80}, year = {2020}, month = {2020 10 15}, pages = {4346-4354}, abstract = {

Pancreatic cancer is among the most well-characterized cancer types, yet a large proportion of the heritability of pancreatic cancer risk remains unclear. Here, we performed a large transcriptome-wide association study to systematically investigate associations between genetically predicted gene expression in normal pancreas tissue and pancreatic cancer risk. Using data from 305 subjects of mostly European descent in the Genotype-Tissue Expression Project, we built comprehensive genetic models to predict normal pancreas tissue gene expression, modifying the UTMOST (unified test for molecular signatures). These prediction models were applied to the genetic data of 8,275 pancreatic cancer cases and 6,723 controls of European ancestry. Thirteen genes showed an association of genetically predicted expression with pancreatic cancer risk at an FDR <= 0.05, including seven previously reported genes (, and ) and six novel genes not yet reported for pancreatic cancer risk [6q27: OR (95\% confidence interval (CI), 1.54 (1.25-1.89); 13q12.13: OR (95\% CI), 0.78 (0.70-0.88); 14q24.3: OR (95\% CI), 1.35 (1.17-1.56); 17q12: OR (95\% CI), 6.49 (2.96-14.27); 17q21.1: OR (95\% CI), 1.94 (1.45-2.58); and 20p13: OR (95\% CI): 1.41 (1.20-1.66)]. The associations for 10 of these genes (, and ) remained statistically significant even after adjusting for risk SNPs identified in previous genome-wide association study. Collectively, this analysis identified novel candidate susceptibility genes for pancreatic cancer that warrant further investigation. SIGNIFICANCE: A transcriptome-wide association analysis identified seven previously reported and six novel candidate susceptibility genes for pancreatic cancer risk.

}, keywords = {Age Factors, Case-Control Studies, European Continental Ancestry Group, Female, Gene Expression Regulation, Neoplastic, Genetic Predisposition to Disease, Genome-Wide Association Study, Humans, Male, Models, Genetic, Pancreatic Neoplasms, Polymorphism, Single Nucleotide}, issn = {1538-7445}, doi = {10.1158/0008-5472.CAN-20-1353}, author = {Liu, Duo and Zhou, Dan and Sun, Yanfa and Zhu, Jingjing and Ghoneim, Dalia and Wu, Chong and Yao, Qizhi and Gamazon, Eric R and Cox, Nancy J and Wu, Lang} } @article {94, title = {Transcriptomic signatures across human tissues identify functional rare genetic variation.}, journal = {Science}, volume = {369}, year = {2020}, month = {2020 09 11}, abstract = {

Rare genetic variants are abundant across the human genome, and identifying their function and phenotypic impact is a major challenge. Measuring aberrant gene expression has aided in identifying functional, large-effect rare variants (RVs). Here, we expanded detection of genetically driven transcriptome abnormalities by analyzing gene expression, allele-specific expression, and alternative splicing from multitissue RNA-sequencing data, and demonstrate that each signal informs unique classes of RVs. We developed Watershed, a probabilistic model that integrates multiple genomic and transcriptomic signals to predict variant function, validated these predictions in additional cohorts and through experimental assays, and used them to assess RVs in the UK Biobank, the Million Veterans Program, and the Jackson Heart Study. Our results link thousands of RVs to diverse molecular effects and provide evidence to associate RVs affecting the transcriptome with human traits.

}, keywords = {Genetic Variation, Genome, Human, Humans, Multifactorial Inheritance, Organ Specificity, Transcriptome}, issn = {1095-9203}, doi = {10.1126/science.aaz5900}, author = {Ferraro, Nicole M and Strober, Benjamin J and Einson, Jonah and Abell, Nathan S and Aguet, Francois and Barbeira, Alvaro N and Brandt, Margot and Bucan, Maja and Castel, Stephane E and Davis, Joe R and Greenwald, Emily and Hess, Gaelen T and Hilliard, Austin T and Kember, Rachel L and Kotis, Bence and Park, YoSon and Peloso, Gina and Ramdas, Shweta and Scott, Alexandra J and Smail, Craig and Tsang, Emily K and Zekavat, Seyedeh M and Ziosi, Marcello and Ardlie, Kristin G and Assimes, Themistocles L and Bassik, Michael C and Brown, Christopher D and Correa, Adolfo and Hall, Ira and Im, Hae Kyung and Li, Xin and Natarajan, Pradeep and Lappalainen, Tuuli and Mohammadi, Pejman and Montgomery, Stephen B and Battle, Alexis} } @article {109, title = {Type 2 and interferon inflammation regulate SARS-CoV-2 entry factor expression in the airway epithelium.}, journal = {Nat Commun}, volume = {11}, year = {2020}, month = {2020 10 12}, pages = {5139}, abstract = {

Coronavirus disease 2019 (COVID-19) is caused by SARS-CoV-2, an emerging virus that utilizes host proteins ACE2 and TMPRSS2 as entry factors. Understanding the factors affecting the pattern and levels of expression of these genes is important for deeper understanding of SARS-CoV-2 tropism and pathogenesis. Here we explore the role of genetics and co-expression networks in regulating these genes in the airway, through the analysis of nasal airway transcriptome data from 695 children. We identify expression quantitative trait loci for both ACE2 and TMPRSS2, that vary in frequency across world populations. We find TMPRSS2 is part of a mucus secretory network, highly upregulated by type 2 (T2) inflammation through the action of interleukin-13, and that the interferon response to respiratory viruses highly upregulates ACE2 expression. IL-13 and virus infection mediated effects on ACE2 expression were also observed at the protein level in the airway epithelium. Finally, we define airway responses to common coronavirus infections in children, finding that these infections generate host responses similar to other viral species, including upregulation of IL6 and ACE2. Our results reveal possible mechanisms influencing SARS-CoV-2 infectivity and COVID-19 clinical outcomes.

}, keywords = {Angiotensin-Converting Enzyme 2, Betacoronavirus, Child, Coronavirus Infections, COVID-19, Epithelial Cells, Gene Expression Profiling, Gene Expression Regulation, Genetic Variation, Host-Pathogen Interactions, Humans, Inflammation, Interferons, Interleukin-13, Middle Aged, Nasal Mucosa, Pandemics, Peptidyl-Dipeptidase A, Pneumonia, Viral, SARS-CoV-2, Serine Endopeptidases, Virus Internalization}, issn = {2041-1723}, doi = {10.1038/s41467-020-18781-2}, author = {Sajuthi, Satria P and DeFord, Peter and Li, Yingchun and Jackson, Nathan D and Montgomery, Michael T and Everman, Jamie L and Rios, Cydney L and Pruesse, Elmar and Nolin, James D and Plender, Elizabeth G and Wechsler, Michael E and Mak, Angel C Y and Eng, Celeste and Salazar, Sandra and Medina, Vivian and Wohlford, Eric M and Huntsman, Scott and Nickerson, Deborah A and Germer, Soren and Zody, Michael C and Abecasis, Goncalo and Kang, Hyun Min and Rice, Kenneth M and Kumar, Rajesh and Oh, Sam and Rodriguez-Santana, Jose and Burchard, Esteban G and Seibold, Max A} } @article {111, title = {A unified framework for joint-tissue transcriptome-wide association and Mendelian randomization analysis.}, journal = {Nat Genet}, volume = {52}, year = {2020}, month = {2020 11}, pages = {1239-1246}, abstract = {

Here, we present a joint-tissue imputation (JTI) approach and a Mendelian randomization framework for causal inference, MR-JTI. JTI borrows information across transcriptomes of different tissues, leveraging shared genetic regulation, to improve prediction performance in a tissue-dependent manner. Notably, JTI includes the single-tissue imputation method PrediXcan as a special case and outperforms other single-tissue approaches (the Bayesian sparse linear mixed model and Dirichlet process regression). MR-JTI models variant-level heterogeneity (primarily due to horizontal pleiotropy, addressing a major challenge of transcriptome-wide association study interpretation) and performs causal inference with type I error control. We make explicit the connection between the genetic architecture of gene expression and of complex traits and the suitability of Mendelian randomization as a causal inference strategy for transcriptome-wide association studies. We provide a resource of imputation models generated from GTEx and PsychENCODE panels. Analysis of biobanks and meta-analysis data, and extensive simulations show substantially improved statistical power, replication and causal mapping rate for JTI relative to existing approaches.

}, keywords = {Animals, Gene Expression Profiling, Genetic Association Studies, Humans, Lipoproteins, LDL, Mendelian Randomization Analysis, Mice, Models, Genetic, Multifactorial Inheritance, Predictive Value of Tests}, issn = {1546-1718}, doi = {10.1038/s41588-020-0706-2}, author = {Zhou, Dan and Jiang, Yi and Zhong, Xue and Cox, Nancy J and Liu, Chunyu and Gamazon, Eric R} } @article {101, title = {Whole-exome sequencing in adult patients with developmental and epileptic encephalopathy: It is never too late.}, journal = {Clin Genet}, volume = {98}, year = {2020}, month = {2020 11}, pages = {477-485}, abstract = {

Developmental and epileptic encephalopathies (DEE) encompass rare, sporadic neurodevelopmental disorders and usually with pediatric onset. As these conditions are characterized by marked clinical and genetic heterogeneity, whole-exome sequencing (WES) represents the strategy of choice for the molecular diagnosis. While its usefulness is well established in pediatric DEE cohorts, our study is aimed at assessing the WES feasibility in adult DEE patients who experienced a diagnostic odyssey prior to the advent of this technique. We analyzed exomes from 71 unrelated adult DEE patients, consecutively recruited from an Italian cohort for the EPI25 Project. All patients underwent accurate clinical and electrophysiological characterization. An overwhelming percentage (90.1\%) had already undergone negative genetic testing. Variants were classified according to the American College of Medical Genetics and Genomics guidelines. WES disclosed 24 (likely) pathogenic variants among 18 patients in epilepsy-related genes with either autosomal dominant, recessive or X-linked inheritance. Ten of these were novel. We obtained a diagnostic yield of 25.3\%, higher among patients with brain malformations, early-onset epilepsy and dysmorphisms. Despite a median diagnostic delay of 38.7 years, WES analysis provided the long-awaited diagnosis for 18 adult patients, which also had an impact on the clinical management of 50\% of them.

},
  issn = {1399-0004},
  doi = {10.1111/cge.13823},
  author = {Minardi, Raffaella and Licchetta, Laura and Baroni, Maria Chiara and Pippucci, Tommaso and Stipa, Carlotta and Mostacci, Barbara and Severi, Giulia and Toni, Francesco and Bergonzini, Luca and Carelli, Valerio and Seri, Marco and Tinuper, Paolo and Bisulli, Francesca}
}

% Entry 88 (Cell Rep 31, 2020). Export date "2020 04 07" is split into the month macro
% plus a separate day field; the acronym in the title is brace-protected.
@article{88,
  title = {Whole-Genome and {RNA} Sequencing Reveal Variation and Transcriptomic Coordination in the Developing Human Prefrontal Cortex.},
  journal = {Cell Rep},
  volume = {31},
  year = {2020},
  month = apr,
  day = {7},
  pages = {107489},
  abstract = {

Gene expression levels vary across developmental stage, cell type, and region in the brain. Genomic variants also contribute to the variation in expression, and some neuropsychiatric disorder loci may exert their effects through this mechanism. To investigate these relationships, we present BrainVar, a unique resource of paired whole-genome and bulk tissue RNA sequencing from the dorsolateral prefrontal cortex of 176 individuals across prenatal and postnatal development. Here we identify common variants that alter gene expression (expression quantitative trait loci [eQTLs]) constantly across development or predominantly during prenatal or postnatal stages. Both "constant" and "temporal-predominant" eQTLs are enriched for loci associated with neuropsychiatric traits and disorders and colocalize with specific variants. Expression levels of more than 12,000 genes rise or fall in a concerted late-fetal transition, with the transitional genes enriched for cell-type-specific genes and neuropsychiatric risk loci, underscoring the importance of cataloging developmental trajectories in understanding cortical physiology and pathology.

},
  keywords = {Base Sequence, Brain, Computational Biology, Databases, Genetic, Genetic Predisposition to Disease, Genetic Variation, Genome-Wide Association Study, Genomics, Humans, Phenotype, Polymorphism, Single Nucleotide, Prefrontal Cortex, Quantitative Trait Loci, Sequence Analysis, RNA, Transcriptome, Whole Exome Sequencing, Whole Genome Sequencing},
  issn = {2211-1247},
  doi = {10.1016/j.celrep.2020.03.053},
  author = {Werling, Donna M and Pochareddy, Sirisha and Choi, Jinmyung and An, Joon-Yong and Sheppard, Brooke and Peng, Minshi and Li, Zhen and Dastmalchi, Claudia and Santpere, Gabriel and Sousa, Andr{\'e} M M and Tebbenkamp, Andrew T N and Kaur, Navjot and Gulden, Forrest O and Breen, Michael S and Liang, Lindsay and Gilson, Michael C and Zhao, Xuefang and Dong, Shan and Klei, Lambertus and Cicek, A Ercument and Buxbaum, Joseph D and Adle-Biassette, Homa and Thomas, Jean-Leon and Aldinger, Kimberly A and O{\textquoteright}Day, Diana R and Glass, Ian A and Zaitlen, Noah A and Talkowski, Michael E and Roeder, Kathryn and State, Matthew W and Devlin, Bernie and Sanders, Stephan J and Sestan, Nenad}
}

% Entry 54 (Am J Hum Genet 104, 2019). Export date "2019 Mar 07" is split into the month
% macro plus a separate day field; page range uses an en-dash (--).
@article{54,
  title = {{ACAT}: A Fast and Powerful p Value Combination Method for Rare-Variant Analysis in Sequencing Studies.},
  journal = {Am J Hum Genet},
  volume = {104},
  year = {2019},
  month = mar,
  day = {7},
  pages = {410--421},
  abstract = {

Set-based analysis that jointly tests the association of variants in a group has emerged as a popular tool for analyzing rare and low-frequency variants in sequencing studies. The existing set-based tests can suffer significant power loss when only a small proportion of variants are causal, and their powers can be sensitive to the number, effect sizes, and effect directions of the causal variants and the choices of weights. Here we propose an aggregated Cauchy association test (ACAT), a general, powerful, and computationally efficient p value combination method for boosting power in sequencing studies. First, by combining variant-level p values, we use ACAT to construct a set-based test (ACAT-V) that is particularly powerful in the presence of only a small number of causal variants in a variant set. Second, by combining different variant-set-level p values, we use ACAT to construct an omnibus test (ACAT-O) that combines the strength of multiple complementary set-based tests, including the burden test, sequence kernel association test (SKAT), and ACAT-V. Through analysis of extensively simulated data and the whole-genome sequencing data from the Atherosclerosis Risk in Communities (ARIC) study, we demonstrate that ACAT-V complements the SKAT and the burden test, and that ACAT-O has a substantially more robust and higher power than those of the alternative tests.

},
  issn = {1537-6605},
  doi = {10.1016/j.ajhg.2019.01.002},
  author = {Liu, Yaowu and Chen, Sixing and Li, Zilin and Morrison, Alanna C and Boerwinkle, Eric and Lin, Xihong}
}

% Entry 64 (Nat Neurosci 22, 2019). Export date "2019 05" normalised to the month macro;
% page range uses an en-dash (--); proper noun and acronym in the title are brace-protected.
@article{64,
  title = {A {Bayesian} framework that integrates multi-omics data and gene networks predicts risk genes from schizophrenia {GWAS} data.},
  journal = {Nat Neurosci},
  volume = {22},
  year = {2019},
  month = may,
  pages = {691--699},
  abstract = {

Genome-wide association studies (GWAS) have identified more than 100 schizophrenia (SCZ)-associated loci, but using these findings to illuminate disease biology remains a challenge. Here we present integrative risk gene selector (iRIGS), a Bayesian framework that integrates multi-omics data and gene networks to infer risk genes in GWAS loci. By applying iRIGS to SCZ GWAS data, we predicted a set of high-confidence risk genes, most of which are not the nearest genes to the GWAS index variants. High-confidence risk genes account for a significantly enriched heritability, as estimated by stratified linkage disequilibrium score regression. Moreover, high-confidence risk genes are predominantly expressed in brain tissues, especially prenatally, and are enriched for targets of approved drugs, suggesting opportunities to reposition existing drugs for SCZ. Thus, iRIGS can leverage accumulating functional genomics and GWAS data to advance our understanding of SCZ etiology and potential therapeutics.

},
  keywords = {Animals, Bayes Theorem, Disease Models, Animal, Gene Regulatory Networks, Genetic Predisposition to Disease, Genome-Wide Association Study, Genomics, Humans, Mice, Risk Factors, Schizophrenia},
  issn = {1546-1726},
  doi = {10.1038/s41593-019-0382-7},
  author = {Wang, Quan and Chen, Rui and Cheng, Feixiong and Wei, Qiang and Ji, Ying and Yang, Hai and Zhong, Xue and Tao, Ran and Wen, Zhexing and Sutcliffe, James S and Liu, Chunyu and Cook, Edwin H and Cox, Nancy J and Li, Bingshan}
}

% Entry 66 (Nat Commun 10, 2019). Export date "2019 Sep 06" is split into the month macro
% plus a separate day field; the proper name in the title is brace-protected.
@article{66,
  title = {Components of genetic associations across 2,138 phenotypes in the {UK} {Biobank} highlight adipocyte biology.},
  journal = {Nat Commun},
  volume = {10},
  year = {2019},
  month = sep,
  day = {6},
  pages = {4064},
  abstract = {

Population-based biobanks with genomic and dense phenotype data provide opportunities for generating effective therapeutic hypotheses and understanding the genomic role in disease predisposition. To characterize latent components of genetic associations, we apply truncated singular value decomposition (DeGAs) to matrices of summary statistics derived from genome-wide association analyses across 2,138 phenotypes measured in 337,199 White British individuals in the UK Biobank study. We systematically identify key components of genetic associations and the contributions of variants, genes, and phenotypes to each component. As an illustration of the utility of the approach to inform downstream experiments, we report putative loss of function variants, rs114285050 (GPR151) and rs150090666 (PDE3B), that substantially contribute to obesity-related traits and experimentally demonstrate the role of these genes in adipocyte biology. Our approach to dissect components of genetic associations across the human phenome will accelerate biomedical hypothesis generation by providing insights on previously unexplored latent structures.

},
  issn = {2041-1723},
  doi = {10.1038/s41467-019-11953-9},
  author = {Tanigawa, Yosuke and Li, Jiehan and Justesen, Johanne M and Horn, Heiko and Aguirre, Matthew and DeBoever, Christopher and Chang, Chris and Narasimhan, Balasubramanian and Lage, Kasper and Hastie, Trevor and Park, Chong Y and Bejerano, Gill and Ingelsson, Erik and Rivas, Manuel A}
}

% Entry 60 (Bioinformatics 35, 2019). Export date "2019 May 01" is split into the month
% macro plus a separate day field; page range uses an en-dash (--).
@article{60,
  title = {De novo pattern discovery enables robust assessment of functional consequences of non-coding variants.},
  journal = {Bioinformatics},
  volume = {35},
  year = {2019},
  month = may,
  day = {1},
  pages = {1453--1460},
  abstract = {

MOTIVATION: Given the complexity of genome regions, prioritizing the functional effects of non-coding variants remains a challenge. Although several frameworks have been proposed for the evaluation of the functionality of non-coding variants, most of them used {\textquoteleft}black boxes{\textquoteright} methods that simplify the task as the pathogenicity/benign classification problem, which ignores the distinct regulatory mechanisms of variants and leads to less desirable performance. In this study, we developed DVAR, an unsupervised framework that leverages various biochemical and evolutionary evidence to distinguish the gene regulatory categories of variants and assess their comprehensive functional impact simultaneously.

RESULTS: DVAR performed de novo pattern discovery in high-dimensional data and identified five regulatory clusters of non-coding variants. Leveraging the new insights into the multiple functional patterns, it measures both the between-class and the within-class functional implication of the variants to achieve accurate prioritization. Compared to other two-class learning methods, it showed improved performance in identification of clinically significant variants, fine-mapped GWAS variants, eQTLs and expression-modulating variants. Moreover, it has superior performance on disease causal variants verified by genome-editing (like CRISPR-Cas9), which could provide a pre-selection strategy for genome-editing technologies across the whole genome. Finally, evaluated in BioVU and UK Biobank, two large-scale DNA biobanks linked to complete electronic health records, DVAR demonstrated its effectiveness in prioritizing non-coding variants associated with medical phenotypes.

AVAILABILITY AND IMPLEMENTATION: The C++ and Python source codes, the pre-computed DVAR-cluster labels and DVAR-scores across the whole genome are available at https://www.vumc.org/cgg/dvar.

SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.

},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/bty826},
  author = {Yang, Hai and Chen, Rui and Wang, Quan and Wei, Qiang and Ji, Ying and Zheng, Guangze and Zhong, Xue and Cox, Nancy J and Li, Bingshan}
}

% Entry 52 (Neuron 101, 2019). Export date "2019 02 20" is split into the month macro
% plus a separate day field; page range uses an en-dash (--); gene symbol is brace-protected.
@article{52,
  title = {Differential {NOVA2}-Mediated Splicing in Excitatory and Inhibitory Neurons Regulates Cortical Development and Cerebellar Function.},
  journal = {Neuron},
  volume = {101},
  year = {2019},
  month = feb,
  day = {20},
  pages = {707--720.e5},
  abstract = {

RNA-binding proteins (RBPs) regulate genetic diversity, but the degree to which they do so in individual cell types in~vivo is unknown. We developed NOVA2 cTag-crosslinking and immunoprecipitation (CLIP) to generate functional RBP-RNA maps from different neuronal populations in the mouse brain. Combining cell type datasets from Nova2-cTag and Nova2 conditional knockout mice revealed differential NOVA2 regulatory actions on alternative splicing (AS) on the same transcripts expressed in different neurons. This includes functional differences in transcripts expressed in cortical and cerebellar excitatory versus inhibitory neurons, where we find NOVA2 is required for, respectively, development of laminar structure, motor coordination, and synapse formation. We also find that NOVA2-regulated AS is coupled to NOVA2 regulation of intron retention in hundreds of transcripts, which can sequester the trans-acting splicing factor PTBP2. In summary, cTag-CLIP complements single-cell RNA sequencing (RNA-seq) studies by providing a means for understanding RNA regulation of functional cell diversity.

},
  keywords = {Alternative Splicing, Animals, Antigens, Neoplasm, Cells, Cultured, Cerebellum, Cerebral Cortex, Excitatory Postsynaptic Potentials, Female, Inhibitory Postsynaptic Potentials, Male, Mice, Mice, Inbred C57BL, Nerve Tissue Proteins, Neurogenesis, Neurons, Polypyrimidine Tract-Binding Protein, RNA-Binding Proteins},
  issn = {1097-4199},
  doi = {10.1016/j.neuron.2018.12.019},
  author = {Saito, Yuhki and Yuan, Yuan and Zucker-Scharff, Ilana and Fak, John J and Jereb, Sa{\v s}a and Tajima, Yoko and Licatalosi, Donny D and Darnell, Robert B}
}

% Entry 71 (Front Neurol 10, 2019). The export carried month = "2019", i.e. the year only,
% so no month field is set here (a style would otherwise print the year twice).
@article{71,
  title = {Drug-Resistant Juvenile Myoclonic Epilepsy: Misdiagnosis of Progressive Myoclonus Epilepsy.},
  journal = {Front Neurol},
  volume = {10},
  year = {2019},
  pages = {946},
  abstract = {

Juvenile myoclonic epilepsy (JME) is a common epilepsy syndrome characterized by bilateral myoclonic and tonic-clonic seizures typically starting in adolescence and responding well to medication. Misdiagnosis of a more severe progressive myoclonus epilepsy (PME) as JME has been suggested as a cause of drug-resistance. Medical records of the Epilepsy Center Hessen-Marburg between 2005 and 2014 were automatically selected using keywords and manually reviewed regarding the presence of a JME diagnosis at any timepoint. The identified patients were evaluated regarding seizure outcome and drug resistance according to ILAE criteria. 87/168 identified JME patients were seizure-free at last follow-up including 61 drug-responsive patients (group NDR). Seventy-eight patients were not seizure-free including 26 drug-resistant patients (group DR). Valproate was the most efficacious AED. The JME diagnosis was revised in 7 patients of group DR including 6 in whom the diagnosis had already been questioned or revised during clinical follow-up. One of these was finally diagnosed with PME (genetically confirmed Lafora disease) based on genetic testing. She was initially reviewed at age 29 yrs and considered to be inconsistent with PME. Intellectual disability ($p = 0.025$), cognitive impairment ($p < 0.001$), febrile seizures in first-degree relatives ($p = 0.023$) and prominent dialeptic seizures ($p = 0.009$) were significantly more frequent in group DR. Individuals with PME are rarely found among drug-resistant alleged JME patients in a tertiary epilepsy center. Even a very detailed review by experienced epileptologists may not identify the presence of PME before the typical features evolve underpinning the need for early genetic testing in drug-resistant JME patients.

},
  issn = {1664-2295},
  doi = {10.3389/fneur.2019.00946},
  author = {Martin, Sarah and Strzelczyk, Adam and Lindlar, Silvia and Krause, Kristina and Reif, Philipp S and Menzler, Katja and Chiocchetti, Andreas G and Rosenow, Felix and Knake, Susanne and Klein, Karl Martin}
}

% Entry 50 (Am J Hum Genet 104, 2019). Export date "2019 02 07" is split into the month
% macro plus a separate day field; page range uses an en-dash (--).
@article{50,
  title = {Efficient Variant Set Mixed Model Association Tests for Continuous and Binary Traits in Large-Scale Whole-Genome Sequencing Studies.},
  journal = {Am J Hum Genet},
  volume = {104},
  year = {2019},
  month = feb,
  day = {7},
  pages = {260--274},
  abstract = {

With advances in whole-genome sequencing (WGS) technology, more advanced statistical methods for testing genetic association with rare variants are being developed. Methods in which variants are grouped for analysis are also known as variant-set, gene-based, and aggregate unit tests. The burden test and sequence kernel association test (SKAT) are two widely used variant-set tests, which were originally developed for samples of unrelated individuals and later have been extended to family data with known pedigree structures. However, computationally efficient and powerful variant-set tests are needed to make analyses tractable in large-scale WGS studies with complex study samples. In this paper, we propose the variant-set mixed model association tests (SMMAT) for continuous and binary traits using the generalized linear mixed model framework. These tests can be applied to large-scale WGS studies involving samples with population structure and relatedness, such as in the National Heart, Lung, and Blood Institute{\textquoteright}s Trans-Omics for Precision Medicine (TOPMed) program. SMMATs share the same null model for different variant sets, and a virtue of this null model, which includes covariates only, is that it needs to be fit only once for all tests in each genome-wide analysis. Simulation studies show that all the proposed SMMATs correctly control type I error rates for both continuous and binary traits in the presence of population structure and relatedness. We also illustrate our tests in a real data example of analysis of plasma fibrinogen levels in the TOPMed program (n = 23,763), using the Analysis Commons, a cloud-based computing platform.

},
  keywords = {Chromosomes, Human, Pair 4, Cloud Computing, Female, Fibrinogen, Genetic Association Studies, Genetics, Population, Humans, Male, Models, Genetic, National Heart, Lung, and Blood Institute (U.S.), Precision Medicine, Research Design, Time Factors, United States, Whole Genome Sequencing},
  issn = {1537-6605},
  doi = {10.1016/j.ajhg.2018.12.012},
  author = {Chen, Han and Huffman, Jennifer E and Brody, Jennifer A and Wang, Chaolong and Lee, Seunggeun and Li, Zilin and Gogarten, Stephanie M and Sofer, Tamar and Bielak, Lawrence F and Bis, Joshua C and Blangero, John and Bowler, Russell P and Cade, Brian E and Cho, Michael H and Correa, Adolfo and Curran, Joanne E and de Vries, Paul S and Glahn, David C and Guo, Xiuqing and Johnson, Andrew D and Kardia, Sharon and Kooperberg, Charles and Lewis, Joshua P and Liu, Xiaoming and Mathias, Rasika A and Mitchell, Braxton D and O{\textquoteright}Connell, Jeffrey R and Peyser, Patricia A and Post, Wendy S and Reiner, Alex P and Rich, Stephen S and Rotter, Jerome I and Silverman, Edwin K and Smith, Jennifer A and Vasan, Ramachandran S and Wilson, James G and Yanek, Lisa R and Redline, Susan and Smith, Nicholas L and Boerwinkle, Eric and Borecki, Ingrid B and Cupples, L Adrienne and Laurie, Cathy C and Morrison, Alanna C and Rice, Kenneth M and Lin, Xihong}
}

% Entry 73 (Genome Med 12, 2019). Export date "2019 Dec 31" is split into the month macro
% plus a separate day field; gene symbols in the title are brace-protected.
@article{73,
  title = {Exome sequencing reveals a high prevalence of {BRCA1} and {BRCA2} founder variants in a diverse population-based biobank.},
  journal = {Genome Med},
  volume = {12},
  year = {2019},
  month = dec,
  day = {31},
  pages = {2},
  abstract = {

BACKGROUND: Pathogenic variants in BRCA1 and BRCA2 (BRCA1/2) lead to increased risk of breast, ovarian, and other cancers, but most variant-positive individuals in the general population are unaware of their risk, and little is known about prevalence in non-European populations. We investigated BRCA1/2 prevalence and impact in the electronic health record (EHR)-linked BioMe Biobank in New York City.

METHODS: Exome sequence data from 30,223 adult BioMe participants were evaluated for pathogenic variants in BRCA1/2. Prevalence estimates were made in population groups defined by genetic ancestry and self-report. EHR data were used to evaluate clinical characteristics of variant-positive individuals.

RESULTS: There were 218 (0.7\%) individuals harboring expected pathogenic variants, resulting in an overall prevalence of 1 in 139. The highest prevalence was in individuals with Ashkenazi Jewish (AJ; 1 in 49), Filipino and other~Southeast Asian (1 in 81), and non-AJ European (1 in 103) ancestry. Among 218 variant-positive individuals, 112 (51.4\%) harbored known founder variants: 80 had AJ founder variants (BRCA1 c.5266dupC and c.68\_69delAG, and BRCA2 c.5946delT), 8 had a Puerto Rican founder variant (BRCA2 c.3922G>T), and 24 had one of 19 other founder variants. Non-European populations were more likely to harbor BRCA1/2 variants that were not classified in ClinVar or that had uncertain or conflicting evidence for pathogenicity (uncertain/conflicting). Within mixed ancestry populations, such as Hispanic/Latinos with genetic ancestry from Africa, Europe, and the Americas, there was a strong correlation between the proportion of African genetic ancestry and the likelihood of harboring an uncertain/conflicting variant. Approximately 28\% of variant-positive individuals had a personal history, and 45\% had a personal or family history of BRCA1/2-associated cancers. Approximately 27\% of variant-positive individuals had prior clinical genetic testing for BRCA1/2. However, individuals with AJ founder variants were twice as likely to have had a clinical test (39\%) than those with other pathogenic variants (20\%).

CONCLUSIONS: These findings deepen our knowledge about BRCA1/2 variants and associated cancer risk in diverse populations, indicate a gap in knowledge about potential cancer-related variants in non-European populations, and suggest that genomic screening in diverse patient populations may be an effective tool to identify at-risk individuals.

},
  issn = {1756-994X},
  doi = {10.1186/s13073-019-0691-1},
  author = {Abul-Husn, Noura S and Soper, Emily R and Odgis, Jacqueline A and Cullina, Sinead and Bobo, Dean and Moscati, Arden and Rodriguez, Jessica E and Loos, Ruth J F and Cho, Judy H and Belbin, Gillian M and Suckiel, Sabrina A and Kenny, Eimear E}
}

% Entry 62 (Nat Genet 51, 2019). Export date "2019 Jul" normalised to the month macro;
% page range uses an en-dash (--); eponym in the title is brace-protected.
@article{62,
  title = {Genome sequencing analysis identifies {Epstein-Barr} virus subtypes associated with high risk of nasopharyngeal carcinoma.},
  journal = {Nat Genet},
  volume = {51},
  year = {2019},
  month = jul,
  pages = {1131--1136},
  abstract = {

Epstein-Barr virus (EBV) infection is ubiquitous worldwide and is associated with multiple cancers, including nasopharyngeal carcinoma (NPC). The importance of EBV viral genomic variation in NPC development and its striking epidemic in southern China has been poorly explored. Through large-scale genome sequencing of 270 EBV isolates and two-stage association study of EBV isolates from China, we identify two non-synonymous EBV variants within BALF2 that are strongly associated with the risk of NPC (odds ratio (OR) = 8.69, P = 9.69 {\texttimes} $10^{-25}$ for SNP 162476\_C; OR = 6.14, P = 2.40 {\texttimes} $10^{-32}$ for SNP 163364\_T). The cumulative effects of these variants contribute to 83\% of the overall risk of NPC in southern China. Phylogenetic analysis of the risk variants reveals a unique origin in Asia, followed by clonal expansion in NPC-endemic regions. Our results provide novel insights into the NPC endemic in southern China and also enable the identification of high-risk individuals for NPC prevention.

},
  issn = {1546-1718},
  doi = {10.1038/s41588-019-0436-5},
  author = {Xu, Miao and Yao, Youyuan and Chen, Hui and Zhang, Shanshan and Cao, Su-Mei and Zhang, Zhe and Luo, Bing and Liu, Zhiwei and Li, Zilin and Xiang, Tong and He, Guiping and Feng, Qi-Sheng and Chen, Li-Zhen and Guo, Xiang and Jia, Wei-Hua and Chen, Ming-Yuan and Zhang, Xiao and Xie, Shang-Hang and Peng, Roujun and Chang, Ellen T and Pedergnana, Vincent and Feng, Lin and Bei, Jin-Xin and Xu, Rui-Hua and Zeng, Mu-Sheng and Ye, Weimin and Adami, Hans-Olov and Lin, Xihong and Zhai, Weiwei and Zeng, Yi-Xin and Liu, Jianjun}
}

% Entry 41 (Bioinformatics 35, 2019). Export date "2019 Jul 15" is split into the month
% macro plus a separate day field; page range uses an en-dash (--); tool name is brace-protected.
@article{41,
  title = {{Global Biobank Engine}: enabling genotype-phenotype browsing for biobank summary statistics.},
  journal = {Bioinformatics},
  volume = {35},
  year = {2019},
  month = jul,
  day = {15},
  pages = {2495--2497},
  abstract = {

SUMMARY: Large biobanks linking phenotype to genotype have led to an explosion of genetic association studies across a wide range of phenotypes. Sharing the knowledge generated by these resources with the scientific community remains a challenge due to patient privacy and the vast amount of data. Here, we present Global Biobank Engine (GBE), a web-based tool that enables exploration of the relationship between genotype and phenotype in biobank cohorts, such as the UK Biobank. GBE supports browsing for results from genome-wide association studies, phenome-wide association studies, gene-based tests and genetic correlation between phenotypes. We envision GBE as a platform that facilitates the dissemination of summary statistics from biobanks to the scientific and clinical communities.

AVAILABILITY AND IMPLEMENTATION: GBE currently hosts data from the UK Biobank and can be found freely available at biobankengine.stanford.edu.

},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/bty999},
  author = {McInnes, Gregory and Tanigawa, Yosuke and DeBoever, Chris and Lavertu, Adam and Olivieri, Julia Eve and Aguirre, Matthew and Rivas, Manuel A}
}

% Entry 56 (Am J Hum Genet 104, 2019). Export date "2019 Mar 07" is split into the month
% macro plus a separate day field; page range uses an en-dash (--); gene symbol is brace-protected.
@article{56,
  title = {{GRIK5} Genetically Regulated Expression Associated with Eye and Vascular Phenomes: Discovery through Iteration among Biobanks, Electronic Health Records, and Zebrafish.},
  journal = {Am J Hum Genet},
  volume = {104},
  year = {2019},
  month = mar,
  day = {7},
  pages = {503--519},
  abstract = {

Although the use of model systems for studying the mechanism of mutations that have a large effect is common, we highlight here the ways that zebrafish-model-system studies of a gene, GRIK5, that contributes to the polygenic liability to develop eye diseases have helped to illuminate a mechanism that implicates vascular biology in eye disease. A gene-expression prediction derived from a reference transcriptome panel applied to BioVU, a large electronic health record (EHR)-linked biobank at Vanderbilt University Medical Center, implicated reduced GRIK5 expression in diverse eye diseases. We tested the function of GRIK5 by depletion of its ortholog in zebrafish, and we observed reduced blood vessel numbers and integrity in the eye and increased vascular permeability. Analyses of EHRs in >2.6 million Vanderbilt subjects revealed significant comorbidity of eye and vascular diseases (relative risks 2-15); this comorbidity was confirmed in 150 million individuals from a large insurance claims dataset. Subsequent studies in >60,000 genotyped BioVU participants confirmed the association of reduced genetically predicted expression of GRIK5 with comorbid vascular and eye diseases. Our studies pioneer an approach that allows a rapid iteration of the discovery of gene-phenotype relationships to the primary genetic mechanism contributing to the pathophysiology of human disease. Our findings also add dimension to the understanding of the biology driven by glutamate receptors such as GRIK5 (also referred to as GLUK5 in protein form) and to mechanisms contributing to human eye diseases.

},
  issn = {1537-6605},
  doi = {10.1016/j.ajhg.2019.01.017},
  author = {Unlu, Gokhan and Gamazon, Eric R and Qi, Xinzi and Levic, Daniel S and Bastarache, Lisa and Denny, Joshua C and Roden, Dan M and Mayzus, Ilya and Breyer, Max and Zhong, Xue and Konkashbaev, Anuar I and Rzhetsky, Andrey and Knapik, Ela W and Cox, Nancy J}
}

% Entry 63 (Nat Med 25, 2019). Export date "2019 06" normalised to the month macro;
% page range uses an en-dash (--).
@article{63,
  title = {Identification of rare-disease genes using blood transcriptome sequencing and large control cohorts.},
  journal = {Nat Med},
  volume = {25},
  year = {2019},
  month = jun,
  pages = {911--919},
  abstract = {

It is estimated that 350 million individuals worldwide suffer from rare diseases, which are predominantly caused by mutation in a single gene. The current molecular diagnostic rate is estimated at 50\%, with whole-exome sequencing (WES) among the most successful approaches. For patients in whom WES is uninformative, RNA sequencing (RNA-seq) has shown diagnostic utility in specific tissues and diseases. This includes muscle biopsies from patients with undiagnosed rare muscle disorders, and cultured fibroblasts from patients with mitochondrial disorders. However, for many individuals, biopsies are not performed for clinical care, and tissues are difficult to access. We sought to assess the utility of RNA-seq from blood as a diagnostic tool for rare diseases of different pathophysiologies. We generated whole-blood RNA-seq from 94 individuals with undiagnosed rare diseases spanning 16 diverse disease categories. We developed a robust approach to compare data from these individuals with large sets of RNA-seq data for controls (n = 1,594 unrelated controls and n = 49 family members) and demonstrated the impacts of expression, splicing, gene and variant filtering strategies on disease gene identification. Across our cohort, we observed that RNA-seq yields a 7.5\% diagnostic rate, and an additional 16.7\% with improved candidate gene resolution.

},
  keywords = {Acid Ceramidase, Case-Control Studies, Child, Child, Preschool, Cohort Studies, Female, Genetic Variation, Humans, Male, Models, Genetic, Mutation, Oxidoreductases Acting on CH-CH Group Donors, Potassium Channels, Rare Diseases, RNA, RNA Splicing, Sequence Analysis, RNA, Whole Exome Sequencing},
  issn = {1546-170X},
  doi = {10.1038/s41591-019-0457-8},
  author = {Fr{\'e}sard, Laure and Smail, Craig and Ferraro, Nicole M and Teran, Nicole A and Li, Xin and Smith, Kevin S and Bonner, Devon and Kernohan, Kristin D and Marwaha, Shruti and Zappala, Zachary and Balliu, Brunilda and Davis, Joe R and Liu, Boxiang and Prybol, Cameron J and Kohler, Jennefer N and Zastrow, Diane B and Reuter, Chloe M and Fisk, Dianna G and Grove, Megan E and Davidson, Jean M and Hartley, Taila and Joshi, Ruchi and Strober, Benjamin J and Utiramerur, Sowmithri and Lind, Lars and Ingelsson, Erik and Battle, Alexis and Bejerano, Gill and Bernstein, Jonathan A and Ashley, Euan A and Boycott, Kym M and Merker, Jason D and Wheeler, Matthew T and Montgomery, Stephen B}
}

% Entry 55 (Curr Protoc Hum Genet 101, 2019). Export date "2019 04" normalised to the
% month macro; pages is a single identifier, not a range.
@article{55,
  title = {Methods for the Analysis and Interpretation for Rare Variants Associated with Complex Traits.},
  journal = {Curr Protoc Hum Genet},
  volume = {101},
  year = {2019},
  month = apr,
  pages = {e83},
  abstract = {

With the advent of Next Generation Sequencing (NGS) technologies, whole genome and whole exome DNA sequencing has become affordable for routine genetic studies. Coupled with improved genotyping arrays and genotype imputation methodologies, it is increasingly feasible to obtain rare genetic variant information in large datasets. Such datasets allow researchers to gain a more complete understanding of the genetic architecture of complex traits caused by rare variants. State-of-the-art statistical methods for the statistical genetics analysis of sequence-based association, including efficient algorithms for association analysis in biobank-scale datasets, gene-association tests, meta-analysis, fine mapping methods that integrate functional genomic dataset, and phenome-wide association studies (PheWAS), are reviewed here. These methods are expected to be highly useful for next generation statistical genetics analysis in the era of precision medicine. {\textcopyright} 2019 by John Wiley \& Sons, Inc.

},
  keywords = {Algorithms, Genetic Predisposition to Disease, Genome, Human, Genome-Wide Association Study, Genotype, High-Throughput Nucleotide Sequencing, Humans, Multifactorial Inheritance, Phenotype, Polymorphism, Single Nucleotide, Whole Exome Sequencing, Whole Genome Sequencing},
  issn = {1934-8258},
  doi = {10.1002/cphg.83},
  author = {Weissenkampen, J Dylan and Jiang, Yu and Eckert, Scott and Jiang, Bibo and Li, Bingshan and Liu, Dajiang J}
}

% Entry 51 (Nat Commun 10, 2019). Export date "2019 03 01" is split into the month macro
% plus a separate day field; pages is a single identifier, not a range.
@article{51,
  title = {A multi-task convolutional deep neural network for variant calling in single molecule sequencing.},
  journal = {Nat Commun},
  volume = {10},
  year = {2019},
  month = mar,
  day = {1},
  pages = {998},
  abstract = {

The accurate identification of DNA sequence variants is an important, but challenging task in genomics. It is particularly difficult for single molecule sequencing, which has a per-nucleotide error rate of {\textasciitilde}5--15\%. Meeting this demand, we developed Clairvoyante, a multi-task five-layer convolutional neural network model for predicting variant type (SNP or indel), zygosity, alternative allele and indel length from aligned reads. For the well-characterized NA12878 human sample, Clairvoyante achieves 99.67, 95.78, 90.53\% F1-score on 1KP common variants, and 98.65, 92.57, 87.26\% F1-score for whole-genome analysis, using Illumina, PacBio, and Oxford Nanopore data, respectively. Training on a second human sample shows Clairvoyante is sample agnostic and finds variants in less than 2 h on a standard server. Furthermore, we present 3,135 variants that are missed using Illumina but supported independently by both PacBio and Oxford Nanopore reads. Clairvoyante is available open-source ( https://github.com/aquaskyline/Clairvoyante ), with modules to train, utilize and visualize the model.

}, keywords = {Base Sequence, Computational Biology, DNA Mutational Analysis, Genome, Human, Genome-Wide Association Study, Genomics, Genotype, Genotyping Techniques, Humans, INDEL Mutation, Nanopores, Neural Networks, Computer, Polymorphism, Single Nucleotide, Sequence Analysis, DNA, Software}, issn = {2041-1723}, doi = {10.1038/s41467-019-09025-z}, author = {Luo, Ruibang and Sedlazeck, Fritz J and Lam, Tak-Wah and Schatz, Michael C} } @article {48, title = {Quantification of frequency-dependent genetic architectures in 25 UK Biobank traits reveals action of negative selection.}, journal = {Nat Commun}, volume = {10}, year = {2019}, month = {2019 02 15}, pages = {790}, abstract = {

Understanding the role of rare variants is important in elucidating the genetic basis of human disease. Negative selection can cause rare variants to have larger per-allele effect sizes than common variants. Here, we develop a method to estimate the minor allele frequency (MAF) dependence of SNP effect sizes. We use a model in which per-allele effect sizes have variance proportional to $[p(1 - p)]^{\alpha}$, where $p$ is the MAF and negative values of $\alpha$ imply larger effect sizes for rare variants. We estimate $\alpha$ for 25 UK Biobank diseases and complex traits. All traits produce negative $\alpha$ estimates, with best-fit mean of $-0.38$ (s.e. 0.02) across traits. Despite larger rare variant effect sizes, rare variants (MAF $<$ 1\%) explain less than 10\% of total SNP-heritability for most traits analyzed. Using evolutionary modeling and forward simulations, we validate the $\alpha$ model of MAF-dependent trait effects and assess plausible values of relevant evolutionary parameters.

}, keywords = {Algorithms, Alleles, Biological Specimen Banks, Gene Frequency, Genome-Wide Association Study, Genotype, Humans, Models, Genetic, Polymorphism, Single Nucleotide, Quantitative Trait, Heritable, Selection, Genetic, United Kingdom}, issn = {2041-1723}, doi = {10.1038/s41467-019-08424-6}, author = {Schoech, Armin P and Jordan, Daniel M and Loh, Po-Ru and Gazal, Steven and O{\textquoteright}Connor, Luke J and Balick, Daniel J and Palamara, Pier F and Finucane, Hilary K and Sunyaev, Shamil R and Price, Alkes L} } @article {35, title = {A common {TCN1} loss-of-function variant is associated with lower vitamin {B$_{12}$} concentration in African Americans.}, journal = {Blood}, volume = {131}, year = {2018}, month = {2018 06 21}, pages = {2859-2863}, keywords = {African Americans, Female, Genome-Wide Association Study, Humans, Loss of Function Mutation, Male, Polymorphism, Single Nucleotide, Transcobalamins, Vitamin B 12, Vitamin B Deficiency}, issn = {1528-0020}, doi = {10.1182/blood-2018-03-841023}, author = {Hu, Yao and Raffield, Laura M and Polfus, Linda M and Moscati, Arden and Nadkarni, Girish and Preuss, Michael H and Zhong, Xue and Wei, Qiang and Rich, Stephen S and Li, Yun and Wilson, James G and Correa, Adolfo and Loos, Ruth J F and Li, Bingshan and Auer, Paul L and Reiner, Alex P} } @article {44, title = {Fine-mapping and functional studies highlight potential causal variants for rheumatoid arthritis and type 1 diabetes.}, journal = {Nat Genet}, volume = {50}, year = {2018}, month = {2018 10}, pages = {1366-1374}, abstract = {

To define potentially causal variants for autoimmune disease, we fine-mapped 76 rheumatoid arthritis (11,475 cases, 15,870 controls) and type 1 diabetes loci (9,334 cases, 11,111 controls). After sequencing 799 1-kilobase regulatory (H3K4me3) regions within these loci in 568 individuals, we observed accurate imputation for 89\% of common variants. We defined credible sets of $\leq$5 causal variants at 5 rheumatoid arthritis and 10 type 1 diabetes loci. We identified potentially causal missense variants at DNASE1L3, PTPN22, SH2B3, and TYK2, and noncoding variants at MEG3, CD28-CTLA4, and IL2RA. We also identified potential candidate causal variants at SIRPG and TNFAIP3. Using functional assays, we confirmed allele-specific protein binding and differential enhancer activity for three variants: the CD28-CTLA4 rs117701653 SNP, MEG3 rs34552516 indel, and TNFAIP3 rs35926684 indel.

}, keywords = {Alleles, Arthritis, Rheumatoid, Case-Control Studies, CD28 Antigens, Chromosome Mapping, CTLA-4 Antigen, Diabetes Mellitus, Type 1, Gene Frequency, Genetic Loci, Genetic Predisposition to Disease, Genome-Wide Association Study, Humans, Jurkat Cells, Mutation, Polymorphism, Single Nucleotide, Quantitative Trait Loci, RNA, Long Noncoding, Tumor Necrosis Factor alpha-Induced Protein 3}, issn = {1546-1718}, doi = {10.1038/s41588-018-0216-7}, author = {Westra, Harm-Jan and Mart{\'\i}nez-Bonet, Marta and Onengut-Gumuscu, Suna and Lee, Annette and Luo, Yang and Teslovich, Nikola and Worthington, Jane and Martin, Javier and Huizinga, Tom and Klareskog, Lars and Rantapaa-Dahlqvist, Solbritt and Chen, Wei-Min and Quinlan, Aaron and Todd, John A and Eyre, Steve and Nigrovic, Peter A and Gregersen, Peter K and Rich, Stephen S and Raychaudhuri, Soumya} } @article {40, title = {Functional architecture of low-frequency variants highlights strength of negative selection across coding and non-coding annotations.}, journal = {Nat Genet}, volume = {50}, year = {2018}, month = {2018 11}, pages = {1600-1607}, abstract = {

Common variant heritability has been widely reported to be concentrated in variants within cell-type-specific non-coding functional annotations, but little is known about low-frequency variant functional architectures. We partitioned the heritability of both low-frequency (0.5\% $\leq$ minor allele frequency $<$ 5\%) and common (minor allele frequency $\geq$ 5\%) variants in 40 UK Biobank traits across a broad set of functional annotations. We determined that non-synonymous coding variants explain 17 {\textpm} 1\% of low-frequency variant heritability ($h^{2}_{\mathrm{lf}}$) versus 2.1 {\textpm} 0.2\% of common variant heritability ($h^{2}_{\mathrm{c}}$). Cell-type-specific non-coding annotations that were significantly enriched for $h^{2}_{\mathrm{c}}$ of corresponding traits were similarly enriched for $h^{2}_{\mathrm{lf}}$ for most traits, but more enriched for brain-related annotations and traits. For example, H3K4me3 marks in brain dorsolateral prefrontal cortex explain 57 {\textpm} 12\% of $h^{2}_{\mathrm{lf}}$ versus 12 {\textpm} 2\% of $h^{2}_{\mathrm{c}}$ for neuroticism. Forward simulations confirmed that low-frequency variant enrichment depends on the mean selection coefficient of causal variants in the annotation, and can be used to predict effect size variance of causal rare variants (minor allele frequency $<$ 0.5\%).

}, keywords = {Alleles, Biological Specimen Banks, European Continental Ancestry Group, Gene Frequency, Genetic Variation, Genetics, Population, Genome-Wide Association Study, Humans, Linkage Disequilibrium, Molecular Sequence Annotation, Open Reading Frames, Polymorphism, Single Nucleotide, Selection, Genetic, United Kingdom}, issn = {1546-1718}, doi = {10.1038/s41588-018-0231-8}, author = {Gazal, Steven and Loh, Po-Ru and Finucane, Hilary K and Ganna, Andrea and Schoech, Armin and Sunyaev, Shamil and Price, Alkes L} } @article {43, title = {Functional equivalence of genome sequencing analysis pipelines enables harmonized variant calling across human genetics projects.}, journal = {Nat Commun}, volume = {9}, year = {2018}, month = {2018 10 02}, pages = {4038}, abstract = {

Hundreds of thousands of human whole genome sequencing (WGS) datasets will be generated over the next few years. These data are more valuable in aggregate: joint analysis of genomes from many sources increases sample size and statistical power. A central challenge for joint analysis is that different WGS data processing pipelines cause substantial differences in variant calling in combined datasets, necessitating computationally expensive reprocessing. This approach is no longer tenable given the scale of current studies and data volumes. Here, we define WGS data processing standards that allow different groups to produce functionally equivalent (FE) results, yet still innovate on data processing pipelines. We present initial FE pipelines developed at five genome centers and show that they yield similar variant calling results and produce significantly less variability than sequencing replicates. This work alleviates a key technical bottleneck for genome aggregation and helps lay the foundation for community-wide human genetics studies.

}, keywords = {Genome, Human, Human Genetics, Humans, Whole Genome Sequencing}, issn = {2041-1723}, doi = {10.1038/s41467-018-06159-4}, author = {Regier, Allison A and Farjoun, Yossi and Larson, David E and Krasheninina, Olga and Kang, Hyun Min and Howrigan, Daniel P and Chen, Bo-Juen and Kher, Manisha and Banks, Eric and Ames, Darren C and English, Adam C and Li, Heng and Xing, Jinchuan and Zhang, Yeting and Matise, Tara and Abecasis, Gon{\c c}alo R and Salerno, Will and Zody, Michael C and Neale, Benjamin M and Hall, Ira M} } @article {28, title = {Medical relevance of protein-truncating variants across 337,205 individuals in the UK Biobank study.}, journal = {Nat Commun}, volume = {9}, year = {2018}, month = {2018 04 24}, pages = {1612}, abstract = {

Protein-truncating variants can have profound effects on gene function and are critical for clinical genome interpretation and generating therapeutic hypotheses, but their relevance to medical phenotypes has not been systematically assessed. Here, we characterize the effect of 18,228 protein-truncating variants across 135 phenotypes from the UK Biobank and find 27 associations between medical phenotypes and protein-truncating variants in genes outside the major histocompatibility complex. We perform phenome-wide analyses and directly measure the effect in homozygous carriers, commonly referred to as "human knockouts," across medical phenotypes for genes implicated as being protective against disease or associated with at least one phenotype in our study. We find several genes with strong pleiotropic or non-additive effects. Our results illustrate the importance of protein-truncating variants in a variety of diseases.

}, keywords = {Databases, Nucleic Acid, Genome-Wide Association Study, Humans, Phenotype, Proteins, Sequence Deletion, United Kingdom}, issn = {2041-1723}, doi = {10.1038/s41467-018-03910-9}, author = {DeBoever, Christopher and Tanigawa, Yosuke and Lindholm, Malene E and McInnes, Greg and Lavertu, Adam and Ingelsson, Erik and Chang, Chris and Ashley, Euan A and Bustamante, Carlos D and Daly, Mark J and Rivas, Manuel A} } @article {12, title = {Multiple phenotype association tests using summary statistics in genome-wide association studies.}, journal = {Biometrics}, volume = {74}, year = {2018}, month = {2018 Mar}, pages = {165-175}, abstract = {

We study in this article jointly testing the associations of a genetic variant with correlated multiple phenotypes using the summary statistics of individual phenotype analysis from Genome-Wide Association Studies (GWASs). We estimated the between-phenotype correlation matrix using the summary statistics of individual phenotype GWAS analyses, and developed genetic association tests for multiple phenotypes by accounting for between-phenotype correlation without the need to access individual-level data. Since genetic variants often affect multiple phenotypes differently across the genome and the between-phenotype correlation can be arbitrary, we proposed robust and powerful multiple phenotype testing procedures by jointly testing a common mean and a variance component in linear mixed models for summary statistics. We computed the p-values of the proposed tests analytically. This computational advantage makes our methods practically appealing in large-scale GWASs. We performed simulation studies to show that the proposed tests maintained correct type I error rates, and to compare their powers in various settings with the existing methods. We applied the proposed tests to a GWAS Global Lipids Genetics Consortium summary statistics data set and identified additional genetic variants that were missed by the original single-trait analysis.

}, keywords = {Analysis of Variance, Computer Simulation, Genome-Wide Association Study, Humans, Linear Models, Lipids, Models, Genetic, Phenotype}, issn = {1541-0420}, doi = {10.1111/biom.12735}, author = {Liu, Zhonghua and Lin, Xihong} } @article {34, title = {A synthetic-diploid benchmark for accurate variant-calling evaluation.}, journal = {Nat Methods}, volume = {15}, year = {2018}, month = {2018 08}, pages = {595-597}, abstract = {

Existing benchmark datasets for use in evaluating variant-calling accuracy are constructed from a consensus of known short-variant callers, and they are thus biased toward easy regions that are accessible by these algorithms. We derived a new benchmark dataset from the de novo PacBio assemblies of two fully homozygous human cell lines, which provides a relatively more accurate and less biased estimate of small-variant-calling error rates in a realistic context.

}, keywords = {Algorithms, Benchmarking, Cell Line, Tumor, Databases, Genetic, Diploidy, Female, Genetic Variation, Genome, Human, Homozygote, Humans, Hydatidiform Mole, Pregnancy, Synthetic Biology, Uterine Neoplasms, Whole Genome Sequencing}, issn = {1548-7105}, doi = {10.1038/s41592-018-0054-7}, author = {Li, Heng and Bloom, Jonathan M and Farjoun, Yossi and Fleharty, Mark and Gauthier, Laura and Neale, Benjamin and MacArthur, Daniel} } @article {20, title = {Testing for gene-environment interaction under exposure misspecification.}, journal = {Biometrics}, volume = {74}, year = {2018}, month = {2018 Jun}, pages = {653-662}, abstract = {

Complex interplay between genetic and environmental factors characterizes the etiology of many diseases. Modeling gene-environment (GxE) interactions is often challenged by the unknown functional form of the environment term in the true data-generating mechanism. We study the impact of misspecification of the environmental exposure effect on inference for the GxE interaction term in linear and logistic regression models. We first examine the asymptotic bias of the GxE interaction regression coefficient, allowing for confounders as well as arbitrary misspecification of the exposure and confounder effects. For linear regression, we show that under gene-environment independence and some confounder-dependent conditions, when the environment effect is misspecified, the regression coefficient of the GxE interaction can be unbiased. However, inference on the GxE interaction is still often incorrect. In logistic regression, we show that the regression coefficient is generally biased if the genetic factor is associated with the outcome directly or indirectly. Further, we show that the standard robust sandwich variance estimator for the GxE interaction does not perform well in practical GxE studies, and we provide an alternative testing procedure that has better finite sample properties.

}, keywords = {Bias, Confounding Factors, Epidemiologic, Environmental Exposure, Gene-Environment Interaction, Humans, Linear Models, Models, Genetic, Scientific Experimental Error}, issn = {1541-0420}, doi = {10.1111/biom.12813}, author = {Sun, Ryan and Carroll, Raymond J and Christiani, David C and Lin, Xihong} } @article {7, title = {The Effects of Migration and Assortative Mating on Admixture Linkage Disequilibrium.}, journal = {Genetics}, volume = {205}, year = {2017}, month = {2017 Jan}, pages = {375-383}, abstract = {

Statistical models in medical and population genetics typically assume that individuals assort randomly in a population. While this simplifies model complexity, it contradicts an increasing body of evidence of nonrandom mating in human populations. Specifically, it has been shown that assortative mating is significantly affected by genomic ancestry. In this work, we examine the effects of ancestry-assortative mating on the linkage disequilibrium between local ancestry tracks of individuals in an admixed population. To accomplish this, we develop an extension to the Wright-Fisher model that allows for ancestry-based assortative mating. We show that ancestry-assortment perturbs the distribution of local ancestry linkage disequilibrium (LAD) and the variance of ancestry in a population as a function of the number of generations since admixture. This assortment effect can induce errors in demographic inference of admixed populations when methods assume random mating. We derive closed form formulae for LAD under an assortative-mating model with and without migration. We observe that LAD depends on the correlation of global ancestry of couples in each generation, the migration rate of each of the ancestral populations, the initial proportions of ancestral populations, and the number of generations since admixture. We also present the first direct evidence of ancestry-assortment in African Americans and examine LAD in simulated and real admixed population data of African Americans. We find that demographic inference under the assumption of random mating significantly underestimates the number of generations since admixture, and that accounting for assortative mating using the patterns of LAD results in estimates that more closely agree with the historical narrative.

}, keywords = {Alleles, Black or African American, Datasets as Topic, Gene Frequency, Genetics, Population, Genomics, Human Migration, Humans, Linkage Disequilibrium, Models, Genetic, Models, Statistical, Polymorphism, Single Nucleotide}, issn = {1943-2631}, doi = {10.1534/genetics.116.192138}, author = {Zaitlen, Noah and Huntsman, Scott and Hu, Donglei and Spear, Melissa and Eng, Celeste and Oh, Sam S and White, Marquitta J and Mak, Angel and Davis, Adam and Meade, Kelly and Brigino-Buenaventura, Emerita and LeNoir, Michael A and Bibbins-Domingo, Kirsten and Burchard, Esteban G and Halperin, Eran} } @article {46, title = {The Generalized Higher Criticism for Testing SNP-Set Effects in Genetic Association Studies.}, journal = {J Am Stat Assoc}, volume = {112}, year = {2017}, month = {2017}, pages = {64-76}, abstract = {

It is of substantial interest to study the effects of genes, genetic pathways, and networks on the risk of complex diseases. These genetic constructs each contain multiple SNPs, which are often correlated and function jointly, and might be large in number. However, only a sparse subset of SNPs in a genetic construct is generally associated with the disease of interest. In this article, we propose the generalized higher criticism (GHC) to test for the association between an SNP set and a disease outcome. The higher criticism is a test traditionally used in high-dimensional signal detection settings when marginal test statistics are independent and the number of parameters is very large. However, these assumptions do not always hold in genetic association studies, due to linkage disequilibrium among SNPs and the finite number of SNPs in an SNP set in each genetic construct. The proposed GHC overcomes the limitations of the higher criticism by allowing for arbitrary correlation structures among the SNPs in an SNP-set, while performing accurate analytic $p$-value calculations for any finite number of SNPs in the SNP-set. We obtain the detection boundary of the GHC test. We compared empirically using simulations the power of the GHC method with existing SNP-set tests over a range of genetic regions with varied correlation structures and signal sparsity. We apply the proposed methods to analyze the CGEM breast cancer genome-wide association study. Supplementary materials for this article are available online.

}, issn = {0162-1459}, doi = {10.1080/01621459.2016.1192039}, author = {Barnett, Ian and Mukherjee, Rajarshi and Lin, Xihong} } @article {18, title = {Genetic effects on gene expression across human tissues.}, journal = {Nature}, volume = {550}, year = {2017}, month = {2017 Oct 11}, pages = {204-213}, abstract = {

Characterization of the molecular function of the human genome and its variation across individuals is essential for identifying the cellular mechanisms that underlie human genetic traits and diseases. The Genotype-Tissue Expression (GTEx) project aims to characterize variation in gene expression levels across individuals and diverse tissues of the human body, many of which are not easily accessible. Here we describe genetic effects on gene expression levels across 44 human tissues. We find that local genetic variation affects gene expression levels for the majority of genes, and we further identify inter-chromosomal genetic effects for 93 genes and 112 loci. On the basis of the identified genetic effects, we characterize patterns of tissue specificity, compare local and distal effects, and evaluate the functional properties of the genetic effects. We also demonstrate that multi-tissue, multi-individual data can be used to identify genes and pathways affected by human disease-associated variation, enabling a mechanistic interpretation of gene regulation and the genetic basis of disease.

}, keywords = {Alleles, Chromosomes, Human, Disease, Female, Gene Expression Profiling, Gene Expression Regulation, Genetic Variation, Genome, Human, Genotype, Humans, Male, Organ Specificity, Quantitative Trait Loci}, issn = {1476-4687}, doi = {10.1038/nature24277}, author = {Battle, Alexis and Brown, Christopher D and Engelhardt, Barbara E and Montgomery, Stephen B} } @article {32, title = {Genetic identification of a common collagen disease in {Puerto Ricans} via identity-by-descent mapping in a health system.}, journal = {Elife}, volume = {6}, year = {2017}, month = {2017 09 12}, abstract = {

Achieving confidence in the causality of a disease locus is a complex task that often requires supporting data from both statistical genetics and clinical genomics. Here we describe a combined approach to identify and characterize a genetic disorder that leverages distantly related patients in a health system and population-scale mapping. We utilize genomic data to uncover components of distant pedigrees, in the absence of recorded pedigree information, in the multi-ethnic BioMe biobank in New York City. By linking to medical records, we discover a locus associated with both elevated genetic relatedness and extreme short stature. We link the gene, COL27A1, with a little-known genetic disease, previously thought to be rare and recessive. We demonstrate that disease manifests in both heterozygotes and homozygotes, indicating a common collagen disorder impacting up to 2\% of individuals of Puerto Rican ancestry, leading to a better understanding of the continuum of complex and Mendelian disease.

}, keywords = {Adolescent, Adult, Aged, Child, Collagen Diseases, Female, Fibrillar Collagens, Genotype, Heterozygote, Hispanic Americans, Homozygote, Humans, Male, Middle Aged, Molecular Epidemiology, Multigene Family, Musculoskeletal Diseases, New York City, Pedigree, Whole Genome Sequencing, Young Adult}, issn = {2050-084X}, doi = {10.7554/eLife.25060}, author = {Belbin, Gillian Morven and Odgis, Jacqueline and Sorokin, Elena P and Yee, Muh-Ching and Kohli, Sumita and Glicksberg, Benjamin S and Gignoux, Christopher R and Wojcik, Genevieve L and Van Vleck, Tielman and Jeff, Janina M and Linderman, Michael and Schurmann, Claudia and Ruderfer, Douglas and Cai, Xiaoqiang and Merkelson, Amanda and Justice, Anne E and Young, Kristin L and Graff, Misa and North, Kari E and Peters, Ulrike and James, Regina and Hindorff, Lucia and Kornreich, Ruth and Edelmann, Lisa and Gottesman, Omri and Stahl, Eli A and Cho, Judy H and Loos, Ruth J F and Bottinger, Erwin P and Nadkarni, Girish N and Abul-Husn, Noura S and Kenny, Eimear E} } @article {30, title = {The impact of rare variation on gene expression across tissues.}, journal = {Nature}, volume = {550}, year = {2017}, month = {2017 10 11}, pages = {239-243}, abstract = {

Rare genetic variants are abundant in humans and are expected to contribute to individual disease risk. While genetic association studies have successfully identified common genetic variants associated with susceptibility, these studies are not practical for identifying rare variants. Efforts to distinguish pathogenic variants from benign rare variants have leveraged the genetic code to identify deleterious protein-coding alleles, but no analogous code exists for non-coding variants. Therefore, ascertaining which rare variants have phenotypic effects remains a major challenge. Rare non-coding variants have been associated with extreme gene expression in studies using single tissues, but their effects across tissues are unknown. Here we identify gene expression outliers, or individuals showing extreme expression levels for a particular gene, across 44 human tissues by using combined analyses of whole genomes and multi-tissue RNA-sequencing data from the Genotype-Tissue Expression (GTEx) project v6p release. We find that 58\% of underexpression and 28\% of overexpression outliers have nearby conserved rare variants compared to 8\% of non-outliers. Additionally, we developed RIVER (RNA-informed variant effect on regulation), a Bayesian statistical model that incorporates expression data to predict a regulatory effect for rare variants with higher accuracy than models using genomic annotations alone. Overall, we demonstrate that rare variants contribute to large gene expression changes across tissues and provide an integrative method for interpretation of rare variants in individual genomes.

}, keywords = {Bayes Theorem, Female, Gene Expression Profiling, Genetic Variation, Genome, Human, Genomics, Genotype, Humans, Male, Models, Genetic, Organ Specificity, Sequence Analysis, RNA}, issn = {1476-4687}, doi = {10.1038/nature24267}, author = {Li, Xin and Kim, Yungil and Tsang, Emily K and Davis, Joe R and Damani, Farhan N and Chiang, Colby and Hess, Gaelen T and Zappala, Zachary and Strober, Benjamin J and Scott, Alexandra J and Li, Amy and Ganna, Andrea and Bassik, Michael C and Merker, Jason D and Hall, Ira M and Battle, Alexis and Montgomery, Stephen B} } @article {26, title = {Improved methods for multi-trait fine mapping of pleiotropic risk loci.}, journal = {Bioinformatics}, volume = {33}, year = {2017}, month = {2017 Jan 15}, pages = {248-255}, abstract = {

MOTIVATION: Genome-wide association studies (GWAS) have identified thousands of regions in the genome that contain genetic variants that increase risk for complex traits and diseases. However, the variants uncovered in GWAS are typically not biologically causal, but rather, correlated to the true causal variant through linkage disequilibrium (LD). To discern the true causal variant(s), a variety of statistical fine-mapping methods have been proposed to prioritize variants for functional validation.

RESULTS: In this work we introduce a new approach, fastPAINTOR, that leverages evidence across correlated traits, as well as functional annotation data, to improve fine-mapping accuracy at pleiotropic risk loci. To improve computational efficiency, we describe a new importance sampling scheme to perform model inference. First, we demonstrate in simulations that by leveraging functional annotation data, fastPAINTOR increases fine-mapping resolution relative to existing methods. Next, we show that jointly modeling pleiotropic risk regions improves fine-mapping resolution compared to standard single trait and pleiotropic fine mapping strategies. We report a reduction in the number of SNPs required for follow-up in order to capture 90\% of the causal variants from 23 SNPs per locus using a single trait to 12 SNPs when fine-mapping two traits simultaneously. Finally, we analyze summary association data from a large-scale GWAS of lipids and show that these improvements are largely sustained in real data.

AVAILABILITY AND IMPLEMENTATION: The fastPAINTOR framework is implemented in the PAINTOR v3.0 package which is publicly available to the research community at http://bogdan.bioinformatics.ucla.edu/software/paintor. CONTACT: gkichaev@ucla.edu. SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.

}, keywords = {Chromosome Mapping, Genetic Diseases, Inborn, Genetic Loci, Genetic Pleiotropy, Genome-Wide Association Study, Genomics, Humans, Linkage Disequilibrium, Lipid Metabolism, Models, Genetic, Polymorphism, Single Nucleotide, Software}, issn = {1367-4811}, doi = {10.1093/bioinformatics/btw615}, author = {Kichaev, Gleb and Roytman, Megan and Johnson, Ruth and Eskin, Eleazar and Lindstr{\"o}m, Sara and Kraft, Peter and Pasaniuc, Bogdan} } @article {38, title = {Linkage disequilibrium-dependent architecture of human complex traits shows action of negative selection.}, journal = {Nat Genet}, volume = {49}, year = {2017}, month = {2017 Oct}, pages = {1421-1427}, abstract = {

Recent work has hinted at the linkage disequilibrium (LD)-dependent architecture of human complex traits, where SNPs with low levels of LD (LLD) have larger per-SNP heritability. Here we analyzed summary statistics from 56 complex traits (average N = 101,401) by extending stratified LD score regression to continuous annotations. We determined that SNPs with low LLD have significantly larger per-SNP heritability and that roughly half of this effect can be explained by functional annotations negatively correlated with LLD, such as DNase I hypersensitivity sites (DHSs). The remaining signal is largely driven by our finding that more recent common variants tend to have lower LLD and to explain more heritability (P = 2.38 {\texttimes} 10$^{-104}$); the youngest 20\% of common SNPs explain 3.9 times more heritability than the oldest 20\%, consistent with the action of negative selection. We also inferred jointly significant effects of other LD-related annotations and confirmed via forward simulations that they jointly predict deleterious effects.

}, keywords = {Alleles, Chi-Square Distribution, Datasets as Topic, Genetic Fitness, Genetic Variation, Humans, Linkage Disequilibrium, Models, Genetic, Molecular Sequence Annotation, Multifactorial Inheritance, Polymorphism, Single Nucleotide, Selection, Genetic}, issn = {1546-1718}, doi = {10.1038/ng.3954}, author = {Gazal, Steven and Finucane, Hilary K and Furlotte, Nicholas A and Loh, Po-Ru and Palamara, Pier Francesco and Liu, Xuanyao and Schoech, Armin and Bulik-Sullivan, Brendan and Neale, Benjamin M and Gusev, Alexander and Price, Alkes L} }

% ---------------------------------------------------------------------------
% Entries 9, 16, 25 and 47 (2017 journal articles), laid out one field per line.
% Changes relative to the raw export:
%   - pages:    ranges use a double hyphen (700--707) so styles typeset an en dash
%   - month:    the standard three-letter macro replaces strings such as
%               "2017 May", which repeated the year already held in the year field
%   - title:    trailing full stop removed (styles supply their own punctuation);
%               proper nouns and acronyms are braced against style recasing
%   - abstract: tokens dropped by the export are restored where noted below
% Citation keys, authors, keywords, ISSNs and DOIs are unchanged.
% ---------------------------------------------------------------------------

% Entry 9: Pala et al., Nat Genet 49 (2017), regulatory variation in Sardinia.
@article{9,
  author   = {Pala, Mauro and Zappala, Zachary and Marongiu, Mara and Li, Xin and Davis, Joe R and Cusano, Roberto and Crobu, Francesca and Kukurba, Kimberly R and Gloudemans, Michael J and Reinier, Frederic and Berutti, Riccardo and Piras, Maria G and Mulas, Antonella and Zoledziewska, Magdalena and Marongiu, Michele and Sorokin, Elena P and Hess, Gaelen T and Smith, Kevin S and Busonero, Fabio and Maschio, Andrea and Steri, Maristella and Sidore, Carlo and Sanna, Serena and Fiorillo, Edoardo and Bassik, Michael C and Sawcer, Stephen J and Battle, Alexis and Novembre, John and Jones, Chris and Angius, Andrea and Abecasis, Gon{\c c}alo R and Schlessinger, David and Cucca, Francesco and Montgomery, Stephen B},
  title    = {Population- and individual-specific regulatory variation in {Sardinia}},
  journal  = {Nat Genet},
  volume   = {49},
  year     = {2017},
  month    = may,
  pages    = {700--707},
  issn     = {1546-1718},
  doi      = {10.1038/ng.3840},
  keywords = {Alternative Splicing, Chromosome Mapping, Family Health, Female, Gene Expression Profiling, Genetic Predisposition to Disease, Genetic Variation, Genetics, Population, Genome-Wide Association Study, Genotype, Humans, Italy, Male, Polymorphism, Single Nucleotide, Quantitative Trait Loci, Transcription Initiation Site},
  abstract = {Genetic studies of complex traits have mainly identified associations with noncoding variants. To further determine the contribution of regulatory variation, we combined whole-genome and transcriptome data for 624 individuals from Sardinia to identify common and rare variants that influence gene expression and splicing. We identified 21,183 expression quantitative trait loci (eQTLs) and 6,768 splicing quantitative trait loci (sQTLs), including 619 new QTLs. We identified high-frequency QTLs and found evidence of selection near genes involved in malarial resistance and increased multiple sclerosis risk, reflecting the epidemiological history of Sardinia. Using family relationships, we identified 809 segregating expression outliers (median z score of 2.97), averaging 13.3 genes per individual. Outlier genes were enriched for proximal rare variants, providing a new approach to study large-effect regulatory variants and their relevance to traits. Our results provide insight into the effects of regulatory variants and their relationship to population history and individual genetic risk.},
}

% Entry 16: Barfield et al., Genet Epidemiol 41 (2017), testing the indirect effect.
% NOTE(review): the export ended the abstract with "FEV ." (subscript lost);
% restored as FEV$_{1}$ -- confirm against the publisher abstract.
@article{16,
  author   = {Barfield, Richard and Shen, Jincheng and Just, Allan C and Vokonas, Pantel S and Schwartz, Joel and Baccarelli, Andrea A and VanderWeele, Tyler J and Lin, Xihong},
  title    = {Testing for the indirect effect under the null for genome-wide mediation analyses},
  journal  = {Genet Epidemiol},
  volume   = {41},
  year     = {2017},
  month    = dec,
  pages    = {824--833},
  issn     = {1098-2272},
  doi      = {10.1002/gepi.22084},
  keywords = {Basic Helix-Loop-Helix Transcription Factors, DNA Methylation, Epigenomics, Genome-Wide Association Study, Humans, Lung Neoplasms, Models, Genetic, Repressor Proteins},
  abstract = {Mediation analysis helps researchers assess whether part or all of an exposure{\textquoteright}s effect on an outcome is due to an intermediate variable. The indirect effect can help in designing interventions on the mediator as opposed to the exposure and better understanding the outcome{\textquoteright}s mechanisms. Mediation analysis has seen increased use in genome-wide epidemiological studies to test for an exposure of interest being mediated through a genomic measure such as gene expression or DNA methylation (DNAm). Testing for the indirect effect is challenged by the fact that the null hypothesis is composite. We examined the performance of commonly used mediation testing methods for the indirect effect in genome-wide mediation studies. When there is no association between the exposure and the mediator and no association between the mediator and the outcome, we show that these common tests are overly conservative. This is a case that will arise frequently in genome-wide mediation studies. Caution is hence needed when applying the commonly used mediation tests in genome-wide mediation studies. We evaluated the performance of these methods using simulation studies, and performed an epigenome-wide mediation association study in the Normative Aging Study, analyzing DNAm as a mediator of the effect of pack-years on FEV$_{1}$.},
}

% Entry 25: Lemacon et al., Bioinformatics 33 (2017), the VEXOR web application.
% The export's month string was "2017 May 01"; the day is kept in its own field.
@article{25,
  author   = {Lema{\c c}on, Audrey and Joly Beauparlant, Charles and Soucy, Penny and Allen, Jamie and Easton, Douglas and Kraft, Peter and Simard, Jacques and Droit, Arnaud},
  title    = {{VEXOR}: an integrative environment for prioritization of functional variants in fine-mapping analysis},
  journal  = {Bioinformatics},
  volume   = {33},
  year     = {2017},
  month    = may,
  day      = {1},
  pages    = {1389--1391},
  issn     = {1367-4811},
  doi      = {10.1093/bioinformatics/btw826},
  keywords = {Genome, Human, Genome-Wide Association Study, Genomics, Humans, Polymorphism, Genetic, Sequence Analysis, DNA, Software},
  abstract = {MOTIVATION: The identification of the functional variants responsible for observed genome-wide association studies (GWAS) signals is one of the most challenging tasks of the post-GWAS research era. Several tools have been developed to annotate genetic variants by their genomic location and potential functional implications. Each of these tools has its own requirements and internal logic, which forces the user to become acquainted with each interface.

RESULTS: From an awareness of the amount of work needed to analyze a single locus, we have built a flexible, versatile and easy-to-use web interface designed to help in prioritizing variants and predicting their potential functional implications. This interface acts as a single-point of entry linking association results with reference tools and relevant experiments.

AVAILABILITY AND IMPLEMENTATION: VEXOR is an integrative web application implemented through the Shiny framework and available at: http://romix.genome.ulaval.ca/vexor.

CONTACT: arnaud.droit@crchuq.ulaval.ca.

SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
}

% Entry 47: Sofer et al., Biometrics 73 (2017), weighted pseudolikelihood for SNP sets.
% NOTE(review): the export read "a SNP set, example, SNPs ..."; restored as
% "for example," -- confirm the exact wording against the publisher abstract.
@article{47,
  author   = {Sofer, Tamar and Schifano, Elizabeth D and Christiani, David C and Lin, Xihong},
  title    = {Weighted pseudolikelihood for {SNP} set analysis with multiple secondary outcomes in case-control genetic association studies},
  journal  = {Biometrics},
  volume   = {73},
  year     = {2017},
  month    = dec,
  pages    = {1210--1220},
  issn     = {1541-0420},
  doi      = {10.1111/biom.12680},
  keywords = {Case-Control Studies, Computer Simulation, Genetic Association Studies, Humans, Likelihood Functions, Lung Neoplasms, Phenotype, Polymorphism, Single Nucleotide, Smoking},
  abstract = {We propose a weighted pseudolikelihood method for analyzing the association of a SNP set, for example, SNPs in a gene or a genetic pathway or network, with multiple secondary phenotypes in case-control genetic association studies. To boost analysis power, we assume that the SNP-specific effects are shared across all secondary phenotypes using a scaled mean model. We estimate regression parameters using Inverse Probability Weighted (IPW) estimating equations obtained from the weighted pseudolikelihood, which accounts for case-control sampling to prevent potential ascertainment bias. To test the effect of a SNP set, we propose a weighted variance component pseudo-score test. We also propose a penalized IPW pseudolikelihood method for selecting a subset of SNPs that are associated with the multiple secondary phenotypes. We show that the proposed variable selection procedure has the oracle properties and is robust to misspecification of the correlation structure among secondary phenotypes. We select the tuning parameter using a weighted Bayesian Information-like Criterion (wBIC). We evaluate the finite sample performance of the proposed methods via simulations, and illustrate the methods by the analysis of the multiple secondary smoking behavior outcomes in a lung cancer case-control genetic association study.},
}