Source code for limmbo.io.input

from limix.util.preprocess import regressOut

from limmbo.utils.utils import verboseprint
from limmbo.utils.utils import match
from limmbo.utils.utils import scale
from limmbo.utils.utils import makeHardCalledGenotypes
from limmbo.utils.utils import AlleleFrequencies

import pandas as pd
import numpy as np
import re

from scipy_sugar.stats import quantile_gaussianize
from math import sqrt

class MissingInput(Exception):
    """Signal that a required piece of input was not supplied."""


class FormatError(Exception):
    """Signal that supplied input does not have the required format, e.g. a
    matrix that is not square, not symmetric or fails the
    positive-definiteness check."""


class DataMismatch(Exception):
    """Signal that the number of sample/ID names disagrees with the
    dimensions of the data they are meant to label."""


class InputData(object):
    """
    Generate object containing all datasets relevant for variance
    decomposition (phenotypes, relatedness estimates) and pre-processing
    steps (check for common samples and sample order, covariates regression
    and phenotype transformation)

    Arguments:
        verbose (bool): initialise verbose: should progress messages be
            printed to stdout
    """

    def __init__(self, verbose=True):
        self.verbose = verbose
        self.samples = None
        self.phenotypes = None
        self.pheno_samples = None
        self.phenotype_ID = None
        self.covariates = None
        # Legacy attribute name, kept so existing readers do not break; the
        # methods of this class read and write `covs_samples`.
        self.covariate_samples = None
        self.covs_samples = None
        self.relatedness = None
        self.relatedness_samples = None
        self.pcs = None
        self.pc_samples = None
        self.snps = None
        self.genotypes = None
        self.geno_samples = None
        self.genotypes_info = None
        self.genotypes_sd = None
        self.freqs = None
        self.traitlist = None
        self.Cg = None
        self.Cn = None

    def addPhenotypes(self, phenotypes, pheno_samples=None,
                      phenotype_ID=None):
        """
        Add phenotypes, their phenotype ID and their sample IDs to InputData
        instance

        Arguments:
            phenotypes (array-like): [`N` x `P`] phenotype matrix of `N`
                individuals and `P` phenotypes; if pandas.DataFrame with
                pheno_samples as index and phenotype_ID as columns,
                pheno_samples and phenotype_ID do not have to be specified
                separately.
            pheno_samples (array-like): [`N`] sample ID
            phenotype_ID (array-like): [`P`] phenotype IDs

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.phenotypes** (pd.DataFrame): [`N` x `P`] phenotype
                  array
                - **self.pheno_samples** (np.array): [`N`] sample IDs
                - **self.phenotype_ID** (np.array): [`P`] phenotype IDs

        Raises:
            TypeError: IDs are not given and cannot be read from phenotypes
            DataMismatch: number of IDs does not match phenotypes dimensions
            IOError: duplicate sample or trait names

        Examples:

            .. doctest::

                >>> from limmbo.io import input
                >>> import numpy as np
                >>> import pandas as pd
                >>> pheno = np.array(((1,2),(7,1),(3,4)))
                >>> pheno_samples = ['S1','S2', 'S3']
                >>> phenotype_ID = ['ID1','ID2']
                >>> phenotypes = pd.DataFrame(pheno, index=pheno_samples,
                ...     columns = phenotype_ID)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = phenotypes)
                >>> print(indata.phenotypes.shape)
                (3, 2)
                >>> print(indata.pheno_samples.shape)
                (3,)
                >>> print(indata.phenotype_ID.shape)
                (2,)
        """
        if pheno_samples is None:
            try:
                self.pheno_samples = np.array(phenotypes.index)
            except Exception:
                raise TypeError(("pheno_samples are not provided and "
                                 "phenotypes has no index to retrieve "
                                 "pheno_samples from."))
        else:
            self.pheno_samples = np.array(pheno_samples)

        if phenotype_ID is None:
            try:
                self.phenotype_ID = np.array(phenotypes.columns)
            except Exception:
                raise TypeError(("phenotype_ID are not provided and "
                                 "phenotypes has no column names to "
                                 "retrieve phenotype_ID from."))
        else:
            self.phenotype_ID = np.array(phenotype_ID)

        if phenotypes.shape[0] != self.pheno_samples.shape[0]:
            raise DataMismatch(
                ('Number of samples in phenotypes ({}) does '
                 'not match number of sample IDs ({}) provided').format(
                     phenotypes.shape[0], self.pheno_samples.shape[0]))
        if phenotypes.shape[1] != self.phenotype_ID.shape[0]:
            raise DataMismatch(
                ('Number phenotypes ({}) does not match '
                 'number of phenotype IDs ({}) provided').format(
                     phenotypes.shape[1], self.phenotype_ID.shape[0]))
        if len(self.pheno_samples) != len(set(self.pheno_samples)):
            raise IOError("Duplicate sample names in phenotypes")
        if len(self.phenotype_ID) != len(set(self.phenotype_ID)):
            raise IOError("Duplicate trait names in phenotypes")

        self.phenotypes = pd.DataFrame(phenotypes,
                                       index=self.pheno_samples,
                                       columns=self.phenotype_ID)

    def addCovariates(self, covariates, covs_samples=None):
        """
        Add [`N` x `K`] covariate data with [`N`] samples and [`K`]
        covariates to InputData instance.

        Arguments:
            covariates (array-like): [`N` x `K`] covariate matrix of `N`
                individuals and `K` covariates; if pandas.DataFrame with
                covs_samples as index, covs_samples do not have to be
                specified separately.
            covs_samples (array-like): [`N`] sample ID

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.covariates** (pd.DataFrame): [`N` x `K`] covariates
                  matrix
                - **self.covs_samples** (np.array): [`N`] sample IDs

        Raises:
            TypeError: sample IDs are not given and covariates has no index
            DataMismatch: number of IDs does not match number of rows
            IOError: duplicate sample names

        Examples:

            .. doctest::

                >>> from limmbo.io import input
                >>> import numpy as np
                >>> import pandas as pd
                >>> covariates = [(1,2,4),(1,1,6),(0,4,8)]
                >>> covs_samples = ['S1','S2', 'S3']
                >>> covariates = pd.DataFrame(covariates,
                ...     index=covs_samples)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addCovariates(covariates = covariates,
                ...     covs_samples = covs_samples)
                >>> print(indata.covariates.shape)
                (3, 3)
                >>> print(indata.covs_samples.shape)
                (3,)
        """
        if covs_samples is None:
            try:
                self.covs_samples = np.array(covariates.index)
            except Exception:
                raise TypeError(("covs_samples are not provided and "
                                 "covariates has no index to retrieve "
                                 "covs_samples from."))
        else:
            self.covs_samples = np.array(covs_samples)

        n_rows = np.array(covariates).shape[0]
        n_ids = np.array(self.covs_samples).shape[0]
        if n_rows != n_ids:
            raise DataMismatch(
                ('Number of samples in covariates ({}) does '
                 'not match number of sample IDs ({}) provided').format(
                     n_rows, n_ids))
        if len(self.covs_samples) != len(set(self.covs_samples)):
            raise IOError("Duplicate sample names in covariates")

        self.covariates = pd.DataFrame(covariates, index=self.covs_samples)

    def addRelatedness(self, relatedness, relatedness_samples=None):
        """
        Add [`N` x `N`] pairwise relatedness estimates of [`N`] samples to
        the InputData instance

        Arguments:
            relatedness (array-like): [`N` x `N`] relatedness matrix of `N`
                individuals; if pandas.DataFrame with relatedness_samples as
                index, relatedness_samples do not have to be specified
                separately.
            relatedness_samples (array-like): [`N`] sample IDs

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.relatedness** (pd.DataFrame): [`N` x `N`]
                  relatedness matrix
                - **self.relatedness_samples** (np.array): [`N`] sample IDs

        Raises:
            TypeError: sample IDs are not given and relatedness has no index
            FormatError: matrix is not square, not exactly symmetric, or its
                Cholesky decomposition fails
            DataMismatch: number of IDs does not match matrix dimension
            IOError: duplicate sample names

        Examples:

            .. doctest::

                >>> from limmbo.io import input
                >>> import numpy
                >>> import pandas as pd
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> random = RandomState(5)
                >>> N = 100
                >>> SNP = 1000
                >>> X = (random.rand(N, SNP) < 0.3).astype(float)
                >>> relatedness = numpy.dot(X, X.T)/float(SNP)
                >>> relatedness_samples = numpy.array(
                ...     ['S{}'.format(x+1) for x in range(N)])
                >>> relatedness = pd.DataFrame(relatedness,
                ...     index=relatedness_samples)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addRelatedness(relatedness = relatedness)
                >>> print(indata.relatedness.shape)
                (100, 100)
                >>> print(indata.relatedness_samples.shape)
                (100,)
        """
        if relatedness_samples is None:
            try:
                self.relatedness_samples = np.array(relatedness.index)
            except Exception:
                raise TypeError(("relatedness_samples are not provided and "
                                 "relatedness has no index to retrieve "
                                 "relatedness_samples from"))
        else:
            self.relatedness_samples = np.array(relatedness_samples)

        rel = np.array(relatedness)
        if rel.shape[0] != rel.shape[1]:
            raise FormatError(
                ('Relatedness has to be a square matrix, but '
                 'number of rows {} is not equal to number of columns '
                 '{}').format(rel.shape[0], rel.shape[1]))
        # Symmetry is tested with exact equality (no tolerance).
        if not np.all(rel - rel.T == 0):
            raise FormatError('Relatedness matrix is not symmetric')
        # The test is a Cholesky decomposition, which numpy only completes
        # for positive-definite input.
        if not self._is_positive_definite(rel):
            raise FormatError(
                'Relatedness matrix is not positive-semi definite')
        if rel.shape[0] != self.relatedness_samples.shape[0]:
            raise DataMismatch(
                ('Number of samples in relatedness ({}) does '
                 'not match number of sample IDs ({}) provided').format(
                     rel.shape[0], self.relatedness_samples.shape[0]))
        if len(self.relatedness_samples) != \
                len(set(self.relatedness_samples)):
            raise IOError("Duplicate sample names in relatedness")

        self.relatedness = pd.DataFrame(relatedness,
                                        index=self.relatedness_samples,
                                        columns=self.relatedness_samples)

    def addGenotypes(self, genotypes, geno_samples=None,
                     genotypes_info=None):
        """
        Add [`N` x `NrSNP`] genotype array of [`N`] samples and [`NrSNP`]
        genotypes, [`N`] array of sample IDs and [`NrSNP` x 2] dataframe of
        genotype description to InputData instance.

        Arguments:
            genotypes (array-like): [`N` x `NrSNP`] genotype array of [`N`]
                samples and [`NrSNP`] genotypes; if pandas.DataFrame with
                geno_samples as index, geno_samples do not have to be
                specified separately.
            geno_samples (array-like): [`N`] vector of `N` sample IDs
            genotypes_info (dataframe): [`NrSNPs` x 2] dataframe with
                columns 'chrom' and 'pos', and rsIDs as index

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.genotypes** (pd.DataFrame): [`N` x `NrSNPs`]
                  genotype matrix
                - **self.geno_samples** (np.array): [`N`] sample IDs
                - **self.genotypes_info** (pd.DataFrame): [`NrSNPs` x 2]
                  dataframe with columns 'chrom' and 'pos', and rsIDs as
                  index

        Raises:
            TypeError: sample IDs are not given and genotypes has no index
            MissingInput: genotypes_info is not given
            DataMismatch: number of sample IDs or of genotypes_info rows
                does not match the genotypes dimensions
            IOError: duplicate sample names

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> data = reader.ReadData(verbose=False)
                >>> file_geno = resource_filename('limmbo',
                ...     'io/test/data/genotypes.csv')
                >>> data.getGenotypes(file_genotypes=file_geno)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addGenotypes(genotypes=data.genotypes,
                ...     genotypes_info=data.genotypes_info)
                >>> indata.geno_samples[:5]
                array(['ID_1', 'ID_2', 'ID_3', 'ID_4', 'ID_5'], dtype=object)
                >>> indata.genotypes.shape
                (1000, 20)
                >>> indata.genotypes.values[:5,:5]
                array([[0., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 0.],
                       [2., 1., 0., 0., 0.],
                       [1., 0., 0., 0., 0.]])
                >>> indata.genotypes_info[:5]
                            chrom       pos
                rs1601111       3  88905003
                rs13270638      8  20286021
                rs75132935      8  76564608
                rs72668606      8  79733124
                rs55770986      7   2087823
        """
        if geno_samples is None:
            try:
                self.geno_samples = np.array(genotypes.index)
            except Exception:
                raise TypeError(("geno_samples are not provided and "
                                 "genotypes has no index to retrieve "
                                 "geno_samples from"))
        else:
            self.geno_samples = np.array(geno_samples)

        if genotypes_info is None:
            raise MissingInput(('Genotype info has to be specified via '
                                'genotypes_info'))

        # Validate against the shape of the data as passed in. Building the
        # DataFrame with `index=` first would either raise inside pandas or
        # silently re-index, so the sample-count check could never fire.
        geno_shape = np.shape(genotypes)
        n_samples = geno_shape[0]
        # One-dimensional input becomes a single-column DataFrame.
        n_snps = geno_shape[1] if len(geno_shape) > 1 else 1

        if n_samples != self.geno_samples.shape[0]:
            raise DataMismatch(
                ('Number of samples in genotypes ({}) does '
                 'not match number of sample IDs ({}) provided').format(
                     n_samples, self.geno_samples.shape[0]))
        if n_snps != genotypes_info.shape[0]:
            raise DataMismatch(
                ('Number of genotypes in genotypes ({}) does '
                 'not match number of genotypes in genotypes_info '
                 '({})').format(n_snps, genotypes_info.shape[0]))
        if len(self.geno_samples) != len(set(self.geno_samples)):
            raise IOError("Duplicate sample names in genotypes")

        # Store the data only once every check has passed.
        self.genotypes = pd.DataFrame(genotypes, index=self.geno_samples)
        self.genotypes_info = genotypes_info

    def addVarianceComponents(self, Cg, Cn,):
        """
        Add [`P` x `P`] matrices of [`P`] trait covariance estimates of the
        genetic trait variance component (Cg) and the non-genetic (noise)
        variance component (Cn) to InputData instance

        Nothing is done unless both Cg and Cn are given.

        Arguments:
            Cg (array-like): [`P` x `P`] matrix of `P` trait covariance
                estimates of the genetic trait covariance component
            Cn (array-like): [`P` x `P`] matrix of `P` trait covariance
                estimates of the non-genetic (noise) trait covariance
                component

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.Cg** (np.array): [`P` x `P`] matrix of `P` trait
                  covariance estimates of the genetic trait covariance
                  component
                - **self.Cn** (np.array): [`P` x `P`] matrix of `P` trait
                  covariance estimates of the non-genetic trait covariance
                  component

        Raises:
            FormatError: phenotypes have not been added yet, or a matrix is
                not square, not exactly symmetric, or its Cholesky
                decomposition fails
            DataMismatch: matrix dimension does not match number of traits

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> import numpy as np
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> data = reader.ReadData(verbose=False)
                >>> file_pheno = resource_filename('limmbo',
                ...     'io/test/data/pheno.csv')
                >>> data.getPhenotypes(file_pheno=file_pheno)
                >>> file_Cg = resource_filename('limmbo',
                ...     'io/test/data/Cg.csv')
                >>> file_Cn = resource_filename('limmbo',
                ...     'io/test/data/Cn.csv')
                >>> data.getVarianceComponents(file_Cg=file_Cg,
                ...     file_Cn=file_Cn)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = data.phenotypes)
                >>> indata.addVarianceComponents(Cg = data.Cg, Cn=data.Cn)
                >>> print(indata.Cg.shape)
                (10, 10)
                >>> print(indata.Cn.shape)
                (10, 10)
        """
        if Cg is not None and Cn is not None:
            if self.phenotypes is None:
                raise FormatError(('Phenotypes have to be added before '
                                   'Cg/Cn can be added'))
            self.Cg = np.array(Cg)
            self.Cn = np.array(Cn)

            # Identical checks for both components; Cg is fully checked
            # before Cn.
            for name, matrix in (('Cg', self.Cg), ('Cn', self.Cn)):
                if matrix.shape[0] != matrix.shape[1]:
                    raise FormatError(
                        ('{} has to be a square matrix, but '
                         'number of rows {} is not equal to number of '
                         'columns {}').format(name, matrix.shape[0],
                                              matrix.shape[1]))
                if not np.all(matrix - matrix.T == 0):
                    raise FormatError('{} is not symmetric'.format(name))
                if not self._is_positive_definite(matrix):
                    raise FormatError(
                        '{} is not positive-semi definite'.format(name))
                if matrix.shape[0] != self.phenotypes.shape[1]:
                    raise DataMismatch(
                        ('Number of traits in {} ({}) does '
                         'not match number of traits ({}) in '
                         'phenotypes').format(name, matrix.shape[0],
                                              self.phenotypes.shape[1]))

    def addPCs(self, pcs, pc_samples=None):
        """
        Add [`N` x `PC`] matrix of [`PC`] principal components from the
        genotypes of [`N`] samples to InputData instance.

        Arguments:
            pcs (array-like): [`N` x `PCs`] principal component matrix of
                `N` individuals and `PCs` principal components; if
                pandas.DataFrame with pc_samples as index, pc_samples do not
                have to be specified separately.
            pc_samples (array-like): [`N`] sample IDs

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.pcs** (pd.DataFrame): [`N` x `PCs`] principal
                  component matrix
                - **self.pc_samples** (np.array): [`N`] sample IDs

        Raises:
            TypeError: sample IDs are not given and pcs has no index
            DataMismatch: number of IDs does not match number of rows
            IOError: duplicate sample names

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> data = reader.ReadData(verbose=False)
                >>> file_pcs = resource_filename('limmbo',
                ...     'io/test/data/pcs.csv')
                >>> data.getPCs(file_pcs=file_pcs, nrpcs=10, delim=" ")
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPCs(pcs = data.pcs)
                >>> print(indata.pcs.shape)
                (1000, 10)
                >>> print(indata.pc_samples.shape)
                (1000,)
        """
        if pc_samples is None:
            try:
                self.pc_samples = np.array(pcs.index)
            except Exception:
                raise TypeError(("pc_samples are not provided and pcs has "
                                 "no index to retrieve pc_samples from"))
        else:
            self.pc_samples = np.array(pc_samples)

        n_rows = np.array(pcs).shape[0]
        n_ids = self.pc_samples.shape[0]
        if n_rows != n_ids:
            raise DataMismatch(
                ('Number of samples in pcs ({}) does '
                 'not match number of sample IDs ({}) provided').format(
                     n_rows, n_ids))
        if len(self.pc_samples) != len(set(self.pc_samples)):
            raise IOError(
                "Duplicate sample names for principal components")

        self.pcs = pd.DataFrame(pcs, index=self.pc_samples)

    def subsetTraits(self, traitlist=None):
        """
        Limit analysis to specific subset of traits

        Arguments:
            traitlist (array-like): array of trait numbers (zero-based
                column positions) to select from phenotypes

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.traitlist** (np.array): of [`t`] trait numbers
                  (int) to choose for analysis
                - **self.phenotypes** (pd.DataFrame): reduced set of
                  [`N` x `t`] phenotypes
                - **self.phenotype_ID** (np.array): reduced set of [`t`]
                  phenotype IDs

        Raises:
            IOError: duplicate entries in traitlist
            DataMismatch: a trait number lies outside the phenotype columns

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io.reader import ReadData
                >>> from limmbo.io.input import InputData
                >>> from limmbo.io.utils import file_type
                >>> data = ReadData(verbose=False)
                >>> file_pheno = resource_filename('limmbo',
                ...     'io/test/data/pheno.csv')
                >>> data.getPhenotypes(file_pheno=file_pheno)
                >>> traitlist = data.getTraitSubset(traitstring="1-3,5")
                >>> indata = InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = data.phenotypes)
                >>> print(indata.phenotypes.shape)
                (1000, 10)
                >>> print(indata.phenotype_ID.shape)
                (10,)
                >>> indata.subsetTraits(traitlist=traitlist)
                >>> print(indata.phenotypes.shape)
                (1000, 4)
                >>> print(indata.phenotype_ID.shape)
                (4,)
        """
        self.traitlist = np.array(traitlist)
        if len(self.traitlist) != len(set(self.traitlist)):
            raise IOError("Duplicate trait names in traitlist")

        # Only an out-of-range position is reported as a DataMismatch; any
        # other failure keeps its own exception instead of being masked.
        try:
            phenotypes = self.phenotypes.iloc[:, self.traitlist]
            phenotype_ID = self.phenotype_ID[self.traitlist]
        except IndexError:
            raise DataMismatch(
                ('Selected trait number {} is greater '
                 'than number of phenotypes provided {}').format(
                     max(self.traitlist) + 1, self.phenotypes.shape[1]))
        self.phenotypes = phenotypes
        self.phenotype_ID = phenotype_ID

    def commonSamples(self, samplelist=None):
        """
        Get [`M`] common samples out of phenotype, relatedness and optional
        covariates with [`N`] samples (if all samples present in all
        datasets [`M`] = [`N`]) and ensure that samples are in same order.

        Arguments:
            samplelist (array-like): array of sample IDs to select from data

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.phenotypes** (pd.DataFrame): [`M` x `P`] phenotype
                  matrix
                - **self.pheno_samples** (np.array): [`M`] sample IDs
                - **self.relatedness** (pd.DataFrame): [`M x M`] relatedness
                  matrix
                - **self.relatedness_samples** (np.array): [`M`] sample IDs
                  of relatedness matrix
                - **self.covariates** (pd.DataFrame): [`M` x `K`] covariates
                  matrix
                - **self.covs_samples** (np.array): [`M`] sample IDs
                - **self.genotypes** (pd.DataFrame): [`M` x `NrSNPs`]
                  genotypes matrix
                - **self.geno_samples** (np.array): [`M`] sample IDs
                - **self.pcs** (pd.DataFrame): [`M` x `PCs`] principal
                  component matrix
                - **self.pc_samples** (np.array): [`M`] sample IDs

        Raises:
            DataMismatch: phenotypes share no samples with one of the other
                datasets, or samplelist is not fully contained in the common
                samples
            IOError: duplicate sample names in samplelist

        Examples:

            .. doctest::

                >>> from limmbo.io import input
                >>> import numpy as np
                >>> import pandas as pd
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> random = RandomState(5)
                >>> P = 2
                >>> K = 4
                >>> N = 10
                >>> SNP = 1000
                >>> pheno = random.normal(0,1, (N, P))
                >>> pheno_samples = np.array(['S{}'.format(x+4)
                ...     for x in range(N)])
                >>> phenotype_ID = np.array(['ID{}'.format(x+1)
                ...     for x in range(P)])
                >>> phenotypes = pd.DataFrame(pheno, index=pheno_samples,
                ...     columns=phenotype_ID)
                >>> X = (random.rand(N, SNP) < 0.3).astype(float)
                >>> relatedness = np.dot(X, X.T)/float(SNP)
                >>> relatedness_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> covariates = random.normal(0,1, (N-2, K))
                >>> covs_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N-2)])
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = pheno,
                ...     pheno_samples = pheno_samples,
                ...     phenotype_ID = phenotype_ID)
                >>> indata.addRelatedness(relatedness = relatedness,
                ...     relatedness_samples = relatedness_samples)
                >>> indata.addCovariates(covariates = covariates,
                ...     covs_samples = covs_samples)
                >>> indata.covariates.shape
                (8, 4)
                >>> indata.phenotypes.shape
                (10, 2)
                >>> indata.relatedness.shape
                (10, 10)
                >>> indata.commonSamples(samplelist=["S4", "S6", "S5"])
                >>> indata.covariates.shape
                (3, 4)
                >>> indata.phenotypes.shape
                (3, 2)
                >>> indata.relatedness.shape
                (3, 3)
        """
        # Start from the phenotype samples and narrow down with every
        # additional dataset that has been added.
        self.samples = self.pheno_samples

        if self.relatedness is not None:
            test_pheno_relatedness = np.intersect1d(
                self.pheno_samples, self.relatedness_samples)
            if len(test_pheno_relatedness) == 0:
                raise DataMismatch(('No common samples between phenotypes '
                                    'and relatedness estimates'))
            self.samples = test_pheno_relatedness

        if self.genotypes is not None:
            test_pheno_geno = np.intersect1d(self.pheno_samples,
                                             self.geno_samples)
            if len(test_pheno_geno) == 0:
                raise DataMismatch(('No common samples between phenotypes, '
                                    'and genotypes'))
            self.samples = np.intersect1d(self.samples, test_pheno_geno)

        if self.covariates is not None:
            test_pheno_covs = np.intersect1d(self.pheno_samples,
                                             self.covs_samples)
            if len(test_pheno_covs) == 0:
                raise DataMismatch(('No common samples between phenotypes, '
                                    'and covariates'))
            self.samples = np.intersect1d(self.samples, test_pheno_covs)

        if self.pcs is not None:
            # addPCs stores the IDs as `pc_samples`.
            test_pheno_pcs = np.intersect1d(self.pheno_samples,
                                            self.pc_samples)
            if len(test_pheno_pcs) == 0:
                raise DataMismatch(('No common samples between phenotypes, '
                                    'and pcs'))
            self.samples = np.intersect1d(self.samples, test_pheno_pcs)

        if samplelist is not None:
            if len(samplelist) != len(set(samplelist)):
                raise IOError("Duplicate sample names in samplelist")
            test_samples_samplelist = np.intersect1d(self.samples,
                                                     samplelist)
            if len(test_samples_samplelist) == 0:
                raise DataMismatch(('No samples between common samples in, '
                                    'datasets and samplelist'))
            if len(test_samples_samplelist) < len(samplelist):
                raise DataMismatch(('Not all Ids in samplelist are '
                                    'contained in common samples from '
                                    'provided datasets'))
            # Keep the caller's ordering of the requested samples.
            self.samples = samplelist

        self.phenotypes = self.phenotypes.loc[self.samples, :]
        self.pheno_samples = np.array(self.phenotypes.index)

        if self.genotypes is not None:
            self.genotypes = self.genotypes.loc[self.samples, :]
            self.geno_samples = np.array(self.genotypes.index)

        if self.relatedness is not None:
            # Subset rows, then columns, of the square matrix.
            self.relatedness = self.relatedness.loc[self.samples, :]
            self.relatedness = self.relatedness[self.samples]
            self.relatedness_samples = np.array(self.relatedness.index)

        if self.covariates is not None:
            self.covariates = self.covariates.loc[self.samples, :]
            self.covs_samples = np.array(self.covariates.index)

        if self.pcs is not None:
            self.pcs = self.pcs.loc[self.samples, :]
            self.pc_samples = np.array(self.pcs.index)

    def regress(self):
        """
        Regress out covariates (optional).

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.phenotypes** (pd.DataFrame): [`M` x `P`] phenotype
                  matrix of residuals of linear model
                - **self.covariates**: None

        Raises:
            DataMismatch: phenotypes and covariates are identical arrays

        Examples:

            .. doctest::

                >>> from limmbo.io import input
                >>> import numpy as np
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> random = RandomState(5)
                >>> P = 5
                >>> K = 4
                >>> N = 100
                >>> pheno = random.normal(0,1, (N, P))
                >>> pheno_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> phenotype_ID = np.array(['ID{}'.format(x+1)
                ...     for x in range(P)])
                >>> covariates = random.normal(0,1, (N, K))
                >>> covs_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = pheno,
                ...     pheno_samples = pheno_samples,
                ...     phenotype_ID = phenotype_ID)
                >>> indata.addCovariates(covariates = covariates,
                ...     covs_samples = covs_samples)
                >>> indata.phenotypes.values[:3, :3]
                array([[ 0.44122749, -0.33087015,  2.43077119],
                       [ 1.58248112, -0.9092324 , -0.59163666],
                       [-1.19276461, -0.20487651, -0.35882895]])
                >>> indata.regress()
                >>> indata.phenotypes.values[:3, :3]
                array([[ 0.34421705, -0.01470998,  2.25710966],
                       [ 1.69886647, -1.41756814, -0.55614649],
                       [-1.10700674, -0.66017713, -0.22201814]])
        """
        if np.array_equal(self.phenotypes, self.covariates):
            raise DataMismatch(('Phenotype and covariate arrays are '
                                'identical'))
        verboseprint('Regress covariates', verbose=self.verbose)
        phenotypes = regressOut(np.array(self.phenotypes),
                                np.array(self.covariates))
        self.phenotypes = pd.DataFrame(phenotypes,
                                       index=self.phenotypes.index,
                                       columns=self.phenotypes.columns)
        self.covariates = None

    def transform(self, transform):
        """
        Transform phenotypes

        Arguments:
            transform (string): transformation method for phenotype data:

                - scale: mean center, divide by sd
                - gaussian: inverse normalisation

        Returns:
            None:
                updated the following attributes of the InputData instance:

                - **self.phenotypes** (pd.DataFrame): [`N` x `P`]
                  (transformed) phenotype matrix

        Raises:
            TypeError: transform is neither 'scale' nor 'gaussian'

        Examples:

            .. doctest::

                >>> from limmbo.io import input
                >>> import numpy as np
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> random = RandomState(5)
                >>> P = 5
                >>> K = 4
                >>> N = 100
                >>> pheno = random.normal(0,1, (N, P))
                >>> pheno_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> phenotype_ID = np.array(['ID{}'.format(x+1)
                ...     for x in range(P)])
                >>> SNP = 1000
                >>> X = (random.rand(N, SNP) < 0.3).astype(float)
                >>> relatedness = np.dot(X, X.T)/float(SNP)
                >>> relatedness_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = pheno,
                ...     pheno_samples = pheno_samples,
                ...     phenotype_ID = phenotype_ID)
                >>> indata.addRelatedness(relatedness = relatedness,
                ...     relatedness_samples = relatedness_samples)
                >>> indata.phenotypes.values[:3, :3]
                array([[ 0.44122749, -0.33087015,  2.43077119],
                       [ 1.58248112, -0.9092324 , -0.59163666],
                       [-1.19276461, -0.20487651, -0.35882895]])
                >>> indata.transform(transform='gaussian')
                >>> indata.phenotypes.values[:3, :3]
                array([[ 0.23799988, -0.11191464,  2.05785598],
                       [ 1.41041953, -0.81365681, -0.92217818],
                       [-1.55977999,  0.01240937, -0.62091817]])
        """
        if transform == 'scale':
            verboseprint('Use %s as transformation' % transform,
                         verbose=self.verbose)
            phenotypes = scale(self.phenotypes)
            self.phenotypes = pd.DataFrame(
                phenotypes,
                index=self.phenotypes.index,
                columns=self.phenotypes.columns)
        elif transform == 'gaussian':
            verboseprint('Use %s as transformation' % transform,
                         verbose=self.verbose)
            # Applied to each phenotype column separately (axis 0).
            phenotypes = np.apply_along_axis(quantile_gaussianize, 0,
                                             self.phenotypes)
            self.phenotypes = pd.DataFrame(
                phenotypes,
                index=self.phenotypes.index,
                columns=self.phenotypes.columns)
        else:
            raise TypeError(
                ('Possible transformation methods are: scale, '
                 'and gaussian but {} provided').format(transform))

    def standardiseGenotypes(self):
        r"""
        Standardise genotypes:

        .. math::

            w_{ij} = \frac{x_{ij} -2p_i}{\sqrt{2p_i (1-p_i)}}

        where :math:`x_{ij}` is the number of copies of the reference allele
        for the :math:`i` th SNP of the :math:`j` th individual and
        :math:`p_i` is the frequency of the reference allele (as described
        in `(Yang et al 2011) <http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3014363&tool=pmcentrez&rendertype=abstract>`_).

        Returns:
            (pd.DataFrame):
                [`N` x `NrSNP`] matrix of `NrSNP` standardised genotypes for
                `N` samples, also stored as **self.genotypes_sd**; rows are
                labelled with the genotype sample IDs, columns are numbered.

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> from limmbo.utils.utils import makeHardCalledGenotypes
                >>> from limmbo.utils.utils import AlleleFrequencies
                >>> data = reader.ReadData(verbose=False)
                >>> file_geno = resource_filename('limmbo',
                ...     'io/test/data/genotypes.csv')
                >>> data.getGenotypes(file_genotypes=file_geno)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addGenotypes(genotypes=data.genotypes,
                ...     genotypes_info=data.genotypes_info)
                >>> geno_sd = indata.standardiseGenotypes()
                >>> geno_sd.iloc[:5,:3]
                             0         1       2
                ID_1 -2.201123 -2.141970 -8.9622
                ID_2 -2.201123 -2.141970 -8.9622
                ID_3 -2.201123 -2.141970 -8.9622
                ID_4  0.908627 -0.604125 -8.9622
                ID_5 -0.646248 -2.141970 -8.9622
        """
        # Convert once; doing this inside the loops copies the whole matrix
        # for every single element.
        geno = np.array(self.genotypes)
        standardised = np.zeros(self.genotypes.shape)
        for snp in range(self.genotypes.shape[1]):
            p, q = AlleleFrequencies(self.genotypes.iloc[:, snp])
            sd_snp = sqrt(2 * p * q)
            # NOTE(review): centring uses the second value returned by
            # AlleleFrequencies (q); presumably that is the reference-allele
            # frequency of the formula above -- confirm against
            # limmbo.utils.utils.AlleleFrequencies.
            standardised[:, snp] = (geno[:, snp] - 2 * q) / sd_snp
        self.genotypes_sd = pd.DataFrame(standardised,
                                         index=self.genotypes.index)
        return self.genotypes_sd

    def getAlleleFrequencies(self):
        """
        Compute allele frequencies of genotypes.

        Returns:
            (pd.DataFrame):
                [`NrSNP` x `2`] matrix with the two values returned by
                AlleleFrequencies per SNP in columns 'p' and 'q', also
                stored as **self.freqs**; index: snp IDs taken from
                genotypes_info

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> from limmbo.utils.utils import makeHardCalledGenotypes
                >>> from limmbo.utils.utils import AlleleFrequencies
                >>> data = reader.ReadData(verbose=False)
                >>> file_geno = resource_filename('limmbo',
                ...     'io/test/data/genotypes.csv')
                >>> data.getGenotypes(file_genotypes=file_geno)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addGenotypes(genotypes=data.genotypes,
                ...     genotypes_info=data.genotypes_info,
                ...     geno_samples=data.geno_samples)
                >>> freqs = indata.getAlleleFrequencies()
                >>> freqs.iloc[:5,:]
                                   p         q
                rs1601111   0.292186  0.707814
                rs13270638  0.303581  0.696419
                rs75132935  0.024295  0.975705
                rs72668606  0.119091  0.880909
                rs55770986  0.169338  0.830662
        """
        n_snps = self.genotypes.shape[1]
        verboseprint('Get allele frequencies of {} snps'.format(n_snps),
                     verbose=self.verbose)
        # Convert once instead of once per SNP.
        geno = np.array(self.genotypes)
        freqs = np.zeros((n_snps, 2))
        for snp in range(n_snps):
            freqs[snp, 0], freqs[snp, 1] = AlleleFrequencies(geno[:, snp])
        self.freqs = pd.DataFrame(freqs,
                                  index=self.genotypes_info.index,
                                  columns=['p', 'q'])
        return self.freqs

    @staticmethod
    def _is_positive_definite(matrix):
        """Return True if the Cholesky decomposition of matrix succeeds,
        False if numpy raises LinAlgError for it."""
        try:
            np.linalg.cholesky(matrix)
        except np.linalg.LinAlgError:
            return False
        return True