Source code for limmbo.io.input

from limix.util.preprocess import regressOut

from limmbo.utils.utils import verboseprint
from limmbo.utils.utils import match
from limmbo.utils.utils import scale
from limmbo.utils.utils import makeHardCalledGenotypes
from limmbo.utils.utils import AlleleFrequencies

import pandas as pd
import numpy as np
import re

from scipy_sugar.stats import quantile_gaussianize
from math import sqrt

class MissingInput(Exception):
    """Signals that a required piece of input was not supplied."""


class FormatError(Exception):
    """Raised when provided input does not have the required format, e.g. a
    relatedness or trait-covariance matrix that is not square, not symmetric
    or not positive-semi definite, or variance components that are added
    before any phenotypes"""
    pass


class DataMismatch(Exception):
    """Signals that sample/ID names and the data they belong to disagree in
    their dimensions."""


[docs]class InputData(object):
    """
    Generate object containing all datasets relevant for variance decomposition
    (phenotypes, relatedness estimates) and pre-processing steps (check for
    common samples and sample order, covariates regression and phenotype
    transformation)

    Arguments:
        verbose (bool):
            initialise verbose: should progress messages be printed to stdout
    """

    def __init__(self, verbose = True):
        self.verbose = verbose
        self.samples = None
        self.phenotypes = None
        self.pheno_samples = None
        self.phenotype_ID = None
        self.covariates = None
        self.covariate_samples = None
        self.relatedness = None
        self.relatedness_samples = None
        self.pcs = None
        self.pc_samples = None
        self.snps = None
        self.genotypes = None
        self.geno_samples = None
        self.genotypes_info = None
        self.Cg = None
        self.Cn = None

[docs]    def addPhenotypes(self, phenotypes, pheno_samples=None, phenotype_ID=None):
        """
        Add phenotypes, their phenotype ID and their sample IDs to
        InputData instance
        
        Arguments:
            phenotypes (array-like):
                [`N x `P`] phenotype matrix of `N` individuals and `P`
                phenotypes; if pandas.DataFrame with pheno_samples as index and 
                phenotypes_ID as columns, pheno_samples and phenotype_ID do not 
                have to specified separately.
            pheno_samples (array-like):
                [`N`] sample ID
            phenotype_ID (array-like):
                [`P`] phenotype IDs
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.phenotypes** (pd.DataFrame):
                  [`N` x `P`] phenotype array
                - **self.pheno_samples** (np.array):
                  [`N`] sample IDs
                - **self.phenotype_ID** (np.array):
                  [`P`] phenotype IDs
        
        
        Examples:
        
            .. doctest::
        
                >>> from limmbo.io import input
                >>> import numpy as np
                >>> import pandas as pd
                >>> pheno = np.array(((1,2),(7,1),(3,4)))
                >>> pheno_samples = ['S1','S2', 'S3']
                >>> phenotype_ID = ['ID1','ID2']
                >>> phenotypes = pd.DataFrame(pheno, index=pheno_samples,
                ...     columns = phenotype_ID)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = phenotypes)
                >>> print indata.phenotypes.shape
                (3, 2)
                >>> print indata.pheno_samples.shape
                (3,)
                >>> print indata.phenotype_ID.shape
                (2,)
        """
        if pheno_samples is None:
            try:
                self.pheno_samples = np.array(phenotypes.index)
            except Exception:
                raise TypeError(("pheno_samples are not provided and "
                    "phenotypes has no index to retrieve pheno_samples from."))
        else:
            self.pheno_samples = np.array(pheno_samples)

        if phenotype_ID is None:
            try:
                self.phenotype_ID = np.array(phenotypes.columns)
            except Exception:
                raise TypeError(("phenotype_ID are not provided and phenotypes "
                    "has no column names to retrieve phenotype_ID from."))
        else:
            self.phenotype_ID = np.array(phenotype_ID)

        if phenotypes.shape[0] != self.pheno_samples.shape[0]:
            raise DataMismatch(('Number of samples in phenotypes ({}) does '
                'not match number of sample IDs ({}) provided').format(
                    phenotypes.shape[0], self.pheno_samples.shape[0]))
        if phenotypes.shape[1] != self.phenotype_ID.shape[0]:
            raise DataMismatch(('Number phenotypes ({}) does not match '
                'number of phenotype IDs ({}) provided').format(
                    phenotypes.shape[1], self.phenotype_ID.shape[0]))
        if len(self.pheno_samples) != len(set(self.pheno_samples)):
            raise IOError("Duplicate sample names in phenotypes")    
        if len(self.phenotype_ID) != len(set(self.phenotype_ID)):
            raise IOError("Duplicate trait names in phenotypes")    
        
        self.phenotypes = pd.DataFrame(phenotypes, index=self.pheno_samples,
            columns = self.phenotype_ID)

[docs]    def addCovariates(self, covariates, covs_samples = None):
        """
        Add [`N` x `K`] covariate data with [`N`] samples and [`K`] covariates
        to InputData instance.
        
        Arguments:
            covariates (array-like):
                [`N x `K`] covariate matrix of `N` individuals and `K`
                covariates; if pandas.DataFrame with covs_samples as index, 
                covs_samples do not have to specified separately.
            covs_samples (array-like):
                [`N`] sample ID
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.covariates** (pd.DataFrame):
                  [`N` x `K`] covariates matrix
                - **self.covs_samples** (np.array):
                  [`N`] sample IDs
        
        Examples:
        
            .. doctest::
        
                >>> from limmbo.io import input
                >>> import numpy as np
                >>> import pandas as pd
                >>> covariates = [(1,2,4),(1,1,6),(0,4,8)]
                >>> covs_samples = ['S1','S2', 'S3']
                >>> covariates = pd.DataFrame(covariates, index=covs_samples)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addCovariates(covariates = covariates,
                ...     covs_samples = covs_samples)
                >>> print indata.covariates.shape
                (3, 3)
                >>> print indata.covs_samples.shape
                (3,)
        
        """
        if covs_samples is None:
            try:
                self.covs_samples = np.array(covariates.index)
            except Exception:
                raise TypeError(("covs_samples are not provided and "
                    "covariates has no index to retrieve covs_samples "
                    "from."))
        else:
            self.covs_samples = np.array(covs_samples)
        if np.array(covariates).shape[0] != np.array(self.covs_samples).shape[0]:
            raise DataMismatch(('Number of samples in covariates ({}) does '
                'not match number of sample IDs ({}) provided').format(
                    np.array(covariates).shape[0],
                    np.array(self.covs_samples).shape[0]))
        if len(self.covs_samples) != len(set(self.covs_samples)):
            raise IOError("Duplicate sample names in covariates")    
        self.covariates = pd.DataFrame(covariates, index=self.covs_samples)

[docs]    def addRelatedness(self, relatedness, relatedness_samples = None):
        """
        Add [`N` x `N`] pairwise relatedness estimates of [`N`] samples to the
        InputData instance
        
        Arguments:
            relatedness (array-like):
                [`N x `N`] relatedness matrix of `N` individuals;
                if pandas.DataFrame with relatedness_samples as index,
                relatedness_samples do not have to specified separately.
            relatedness_samples (array-like):
                [`N`] sample IDs
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.relatedness** (pd.DataFrame):
                  [`N` x `N`] relatedness matrix
                - **self.relatedness_samples** (np.array):
                  [`N`] sample IDs
        
        Examples:
        
            .. doctest::
        
                >>> from limmbo.io import input
                >>> import numpy
                >>> import pandas as pd
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> random = RandomState(5)
                >>> N = 100
                >>> SNP = 1000
                >>> X = (random.rand(N, SNP) < 0.3).astype(float)
                >>> relatedness = numpy.dot(X, X.T)/float(SNP)
                >>> relatedness_samples = numpy.array(
                ...     ['S{}'.format(x+1) for x in range(N)])
                >>> relatedness = pd.DataFrame(relatedness,
                ...     index=relatedness_samples)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addRelatedness(relatedness = relatedness)
                >>> print indata.relatedness.shape
                (100, 100)
                >>> print indata.relatedness_samples.shape
                (100,)
        
        """
        if relatedness_samples is None:
            try:
                self.relatedness_samples = np.array(relatedness.index)
            except Exception:
                raise TypeError(("relatedness_samples are not provided and "
                    "relatedness has no index to retrieve relatedness_samples "
                    "from"))
        else:
            self.relatedness_samples = np.array(relatedness_samples)
        rel = np.array(relatedness)
        if rel.shape[0] != rel.shape[1]:
            raise FormatError(('Relatedness has to be a square matrix, but '
                'number of rows {} is not equal to number of columns '
                '{}').format(rel.shape[0], rel.shape[1]))

        if not np.all(np.array(rel) - np.array(rel).T == 0):
            raise FormatError('Relatedness matrix is not symmetric')
        if not self._is_positive_definite(rel):
            raise FormatError('Relatedness matrix is not positive-semi definite')
        if rel.shape[0] != self.relatedness_samples.shape[0]:
            raise DataMismatch(('Number of samples in relatedness ({}) does '
                    'not match number of sample IDs ({}) provided').format(
                    rel.shape[0], self.relatedness_samples.shape[0]))
        if len(self.relatedness_samples) != len(set(self.relatedness_samples)):
            raise IOError("Duplicate sample names in relatedness")    
        self.relatedness = pd.DataFrame(relatedness,
            index=self.relatedness_samples, columns=self.relatedness_samples)

[docs]    def addGenotypes(self, genotypes, geno_samples = None, 
            genotypes_info = None):
        """
        Add [`N` x `NrSNP`] genotype array of [`N`] samples and [`NrSNP`]
        genotypes, [`N`] array of sample IDs and [`NrSNP` x 2] dataframe of
        genotype description to InputData instance.
        
        Arguments:
            genotypes (array-like):
                [`N` x `NrSNP`] genotype array of [`N`] samples and [`NrSNP`]
                genotypes; if pandas.DataFrame with geno_samples as index,
                geno_samples do not have to specified separately.
            geno_samples (array-like):
                [`N`] vector of `N` sample IDs
            genotypes_info (dataframe):
                  [`NrSNPs` x 2] dataframe with columns 'chrom' and 'pos', and
                  rsIDs as index
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.genotypes** (pd.DataFrame):
                  [`N` x `NrSNPs`] genotype matrix
                - **self.geno_samples** (np.array):
                  [`N`] sample IDs
                - **self.genotypes_info** (pd.DataFrame):
                  [`NrSNPs` x 2] dataframe with columns 'chrom' and 'pos', and
                  rsIDs as index
        
        Examples:
        
            .. doctest::
        
                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> data = reader.ReadData(verbose=False)
                >>> file_geno = resource_filename('limmbo',
                ...                                'io/test/data/genotypes.csv')
                >>> data.getGenotypes(file_genotypes=file_geno)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addGenotypes(genotypes=data.genotypes,
                ...                     genotypes_info=data.genotypes_info)
                >>> indata.geno_samples[:5]
                array(['ID_1', 'ID_2', 'ID_3', 'ID_4', 'ID_5'], dtype=object)
                >>> indata.genotypes.shape
                (1000, 20)
                >>> indata.genotypes.values[:5,:5]
                array([[0., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 0.],
                       [2., 1., 0., 0., 0.],
                       [1., 0., 0., 0., 0.]])
                >>> indata.genotypes_info[:5]
                           chrom       pos
                rs1601111      3  88905003
                rs13270638     8  20286021
                rs75132935     8  76564608
                rs72668606     8  79733124
                rs55770986     7   2087823
        """
        if geno_samples is None:
            try:
                self.geno_samples = np.array(genotypes.index)
            except Exception:
                raise TypeError(("geno_samples are not provided and genotypes "
                    "has no index to retrieve geno_samples from"))
        else:
            self.geno_samples = np.array(geno_samples)
        if genotypes_info is None:
            raise MissingInput(('Genotype info has to be specified via '
                    'genotypes_info'))
        self.genotypes = pd.DataFrame(genotypes, index=self.geno_samples)
        self.genotypes_info = genotypes_info
        if self.genotypes.shape[0] != self.geno_samples.shape[0]:
            raise DataMismatch(('Number of samples in genotypes ({}) does '
                'not match number of sample IDs ({}) provided').format(
                    self.genotypes.shape[0], self.geno_samples.shape[0]))
        if self.genotypes.shape[1] != self.genotypes_info.shape[0]:
            raise DataMismatch(('Number of genotypes in genotypes ({}) does '
                'not match number of genotypes in genotypes_info ({})').format(
                    self.genotypes.shape[1], self.genotypes_info.shape[0]))
        if len(self.geno_samples) != len(set(self.geno_samples)):
            raise IOError("Duplicate sample names in genotypes")    

[docs]    def addVarianceComponents(self, Cg, Cn,):
        """
        Add [`P` x `P`] matrices of [`P`] trait covariance estimates
        of the genetic trait variance component (Cg) and the non-genetic (noise)
        variance component (Cn) to InputData instance
        
        Arguments:
            Cg (array-like):
                [`P x `P`] matrix of `P` trait covariance estimates of the
                genetic trait covaraince component
            Cn (array-like):
                [`P x `P`] matrix of `P` trait covariance estimates of the
                non-genetic (noise) trait covaraince component
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.Cg** (np.array):
                  [`P x `P`] matrix of `P` trait covariance estimates of the
                  genetic trait covariance component
                - **self.Cn** (np.array):
                  [`P x `P`] matrix of `P` trait covariance estimates of the
                  non-genetic trait covaraince component
        
        Examples:
        
            .. doctest::
        
                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> import numpy as np
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> data = reader.ReadData(verbose=False)
                >>> file_pheno = resource_filename('limmbo',
                ...                     'io/test/data/pheno.csv')
                >>> data.getPhenotypes(file_pheno=file_pheno)
                >>> file_Cg = resource_filename('limmbo',
                ...                     'io/test/data/Cg.csv')
                >>> file_Cn = resource_filename('limmbo',
                ...                     'io/test/data/Cn.csv')
                >>> data.getVarianceComponents(file_Cg=file_Cg,
                ...                            file_Cn=file_Cn)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = data.phenotypes)
                >>> indata.addVarianceComponents(Cg = data.Cg, Cn=data.Cn)
                >>> print indata.Cg.shape
                (10, 10)
                >>> print indata.Cg.shape
                (10, 10)
        """
        if Cg is not None and Cn is not None:
            if self.phenotypes is None:
                raise FormatError(('Phenotypes have to be added before Cg/Cn '
                        'can be added'))
            self.Cg = np.array(Cg)
            self.Cn = np.array(Cn)
            if self.Cg.shape[0] != self.Cg.shape[1]:
                raise FormatError(('Cg has to be a square matrix, but '
                    'number of rows {} is not equal to number of columns '
                    '{}').format(self.Cg.shape[0], self.Cg.shape[1]))
            if not np.all(self.Cg - self.Cg.T == 0):
                raise FormatError('Cg is not symmetric')
            if not self._is_positive_definite(self.Cg):
                raise FormatError('Cg is not positive-semi definite')
            if self.Cg.shape[0] != self.phenotypes.shape[1]:
                raise DataMismatch(('Number of traits in Cg ({}) does '
                    'not match number of traits ({}) in phenotypes').format(
                        self.Cg.shape[0], self.phenotypes.shape[1]))
            if self.Cn.shape[0] != self.Cn.shape[1]:
                raise FormatError(('Cn has to be a square matrix, but '
                    'number of rows {} is not equal to number of columns '
                    '{}').format(self.Cn.shape[0], self.Cn.shape[1]))

            if not np.all(self.Cn - self.Cn.T == 0):
                raise FormatError('Cn is not symmetric')
            if not self._is_positive_definite(self.Cn):
                raise FormatError('Cn is not positive-semi definite')
            if self.Cn.shape[0] != self.phenotypes.shape[1]:
                raise DataMismatch(('Number of traits in Cn ({}) does '
                    'not match number of traits ({}) in phenotypes').format(
                        self.Cn.shape[0], self.phenotypes.shape[1]))

[docs]    def addPCs(self, pcs, pc_samples = None):
        """
        Add [`N` x `PC`] matrix of [`PC`] principal components from the
        genotypes of [`N`] samples to InputData instance.
        
        Arguments:
            pcs (array-like):
                [`N x `PCs`] principal component matrix of `N` individuals and
                `PCs` principal components; if pandas.DataFrame with pc_samples
                as index, covs_samples do not have to specified separately.
            pc_samples (array-like):
                [`N`] sample IDs
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.pcs** (pd.DataFrame):
                  [`N` x `PCs`] principal component matrix
                - **self.pc_samples** (np.array):
                  [`N`] sample IDs
        
        Examples:
        
            .. doctest::
        
                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> data = reader.ReadData(verbose=False)
                >>> file_pcs = resource_filename('limmbo',
                ...                     'io/test/data/pcs.csv')
                >>> data.getPCs(file_pcs=file_pcs, nrpcs=10, delim=" ")
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPCs(pcs = data.pcs)
                >>> print indata.pcs.shape
                (1000, 10)
                >>> print indata.pc_samples.shape
                (1000,)
        
        """
        if pc_samples is None:
            try:
                self.pc_samples = np.array(pcs.index)
            except Exception:
                raise TypeError(("pc_samples are not provided and pcs has "
                    "no index to retrieve pc_samples from"))
        else:
            self.pc_samples = np.array(pc_samples)
        if np.array(pcs).shape[0] != np.array(self.pc_samples).shape[0]:
            raise DataMismatch(('Number of samples in pcs ({}) does'
                    'not match number of sample IDs ({}) provided').format(
                        np.array(pcs).shape[0], 
                        np.array(pc_samples).shape[0]))
        if len(self.pc_samples) != len(set(self.pc_samples)):
            raise IOError("Duplicate sample names for principle components")    
        self.pcs = pd.DataFrame(pcs, index=self.pc_samples)


[docs]    def subsetTraits(self, traitlist = None):
        """
        Limit analysis to specific subset of traits
        
        Arguments:
            traitlist (array-like):
                array of trait numbers to select from phenotypes
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.traitlist** (list):
                  of [`t`] trait numbers (int) to choose for analysis
                - **self.phenotypes** (pd.DataFrame):
                  reduced set of [`N` x `t`] phenotypes
                - **self.phenotype.ID** (np.array):
                  reduced set of [`t`] phenotype IDs
        
        Examples:

            .. doctest::
            
                >>> from pkg_resources import resource_filename
                >>> from limmbo.io.reader import ReadData
                >>> from limmbo.io.input import InputData
                >>> from limmbo.io.utils import file_type
                >>> data = ReadData(verbose=False)
                >>> file_pheno = resource_filename('limmbo',
                ...                                'io/test/data/pheno.csv')
                >>> data.getPhenotypes(file_pheno=file_pheno)
                >>> traitlist = data.getTraitSubset(traitstring="1-3,5")
                >>> indata = InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = data.phenotypes)
                >>> print indata.phenotypes.shape
                (1000, 10)
                >>> print indata.phenotype_ID.shape
                (10,)
                >>> indata.subsetTraits(traitlist=traitlist)
                >>> print indata.phenotypes.shape
                (1000, 4)
                >>> print indata.phenotype_ID.shape
                (4,)
        """
        self.traitlist = np.array(traitlist)
        if len(self.traitlist) != len(set(self.traitlist)):
            raise IOError("Duplicate trait names in traitlist")    
        try:
            self.phenotypes = self.phenotypes.iloc[:, self.traitlist]
            self.phenotype_ID = self.phenotype_ID[self.traitlist]
        except:
            raise DataMismatch(('Selected trait number {} is greater '
                    'than number of phenotypes provided {}').format(
                        max(self.traitlist) + 1, 
                        self.phenotypes.shape[1]))

[docs]    def commonSamples(self, samplelist=None):
        """
        Get [`M]` common samples out of phenotype, relatedness and optional
        covariates with [`N`] samples (if all samples present in all datasets
        [`M`] = [`N`]) and ensure that samples are in same order.
        
        Arguments:
            samplelist (array-like):
                array of sample IDs to select from data
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.phenotypes** (pd.DataFrame):
                  [`M` x `P`] phenotype matrix
                - **self.pheno_samples** (np.array):
                  [`M`] sample IDs
                - **self.relatedness** (pd.DataFrame):
                  [`M x M`] relatedness matrix
                - **self.relatedness_samples** (np.array):
                  [`M`] sample IDs of relatedness matrix
                - **self.covariates** (pd.DataFrame):
                  [`M` x `K`] covariates matrix
                - **self.covs_samples** (np.array):
                  [`M`] sample IDs
                - **self.genotypes** (pd.DataFrame):
                  [`M` x `NrSNPs`] genotypes matrix
                - **self.geno_samples** (np.array):
                  [`M`] sample IDs
                - **self.pcs** (pd.DataFrame):
                  [`M` x `PCs`] principal component matrix
                - **self.pc_samples** (np.array):
                  [`M`] sample IDs
        
        Examples:
        
            .. doctest::
        
                >>> from limmbo.io import input
                >>> import numpy as np
                >>> import pandas as pd
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> random = RandomState(5)
                >>> P = 2
                >>> K = 4
                >>> N = 10
                >>> SNP = 1000
                >>> pheno = random.normal(0,1, (N, P))
                >>> pheno_samples = np.array(['S{}'.format(x+4)
                ...     for x in range(N)])
                >>> phenotype_ID = np.array(['ID{}'.format(x+1)
                ...     for x in range(P)])
                >>> phenotypes = pd.DataFrame(pheno, index=pheno_samples,
                ...     columns=phenotype_ID)
                >>> X = (random.rand(N, SNP) < 0.3).astype(float)
                >>> relatedness = np.dot(X, X.T)/float(SNP)
                >>> relatedness_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> covariates = random.normal(0,1, (N-2, K))
                >>> covs_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N-2)])
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = pheno,
                ...                      pheno_samples = pheno_samples,
                ...                      phenotype_ID = phenotype_ID)
                >>> indata.addRelatedness(relatedness = relatedness,
                ...                  relatedness_samples = relatedness_samples)
                >>> indata.addCovariates(covariates = covariates,
                ...                      covs_samples = covs_samples)
                >>> indata.covariates.shape
                (8, 4)
                >>> indata.phenotypes.shape
                (10, 2)
                >>> indata.relatedness.shape
                (10, 10)
                >>> indata.commonSamples(samplelist=["S4", "S6", "S5"])
                >>> indata.covariates.shape
                (3, 4)
                >>> indata.phenotypes.shape
                (3, 2)
                >>> indata.relatedness.shape
                (3, 3)
        """
        self.samples = self.pheno_samples

        if self.relatedness is not None:
            test_pheno_relatedness = np.intersect1d(self.pheno_samples, 
                    self.relatedness_samples)
            if len(test_pheno_relatedness) == 0:
                raise DataMismatch(('No common samples between phenotypes and '
                        'relatedness estimates'))
            self.samples = test_pheno_relatedness
        if self.genotypes is not None:
            test_pheno_geno = np.intersect1d(self.pheno_samples, 
                    self.geno_samples)
            if len(test_pheno_geno) == 0:
                raise DataMismatch(('No common samples between phenotypes,'
                        'and genotypes'))
            self.samples = np.intersect1d(self.samples, test_pheno_geno)
        if self.covariates is not None:
            test_pheno_covs = np.intersect1d(self.pheno_samples, 
                    self.covs_samples)
            if len(test_pheno_covs) == 0:
                raise DataMismatch(('No common samples between phenotypes, '
                        'and covariates'))
            self.samples = np.intersect1d(self.samples, test_pheno_covs)
        if self.pcs is not None:
            test_pheno_pcs = np.intersect1d(self.pheno_samples, 
                    self.pcs_samples)
            if len(test_pheno_pcs) == 0:
                raise DataMismatch(('No common samples between phenotypes,'
                        'and pcs'))
            self.samples = np.intersect1d(self.samples, test_pheno_pcs)

        if samplelist is not None:
            if len(samplelist) != len(set(samplelist)):
                raise IOError("Duplicate sample names in samplelist")
            test_samples_samplelist = np.intersect1d(self.samples, samplelist)
            if len(test_samples_samplelist) == 0:
                raise DataMismatch(('No samples between common samples in, '
                        'datasets and samplelist'))
            if len(test_samples_samplelist) < len(samplelist):
                raise DataMismatch(('Not all Ids in samplelist are contained '
                    'in common samples from provided datasets'))
            self.samples = samplelist
    
        self.phenotypes = self.phenotypes.loc[self.samples,:]
        self.pheno_samples = np.array(self.phenotypes.index)

        if self.genotypes is not None:
            self.genotypes = self.genotypes.loc[self.samples,:]
            self.geno_samples = np.array(self.genotypes.index)
        if self.relatedness is not None:
            self.relatedness = self.relatedness.loc[self.samples,:]
            self.relatedness = self.relatedness[self.samples]
            self.relatedness_samples = np.array(self.relatedness.index)
        if self.covariates is not None:
            self.covariates = self.covariates.loc[self.samples,:]
            self.covs_samples = np.array(self.covariates.index)
        if self.pcs is not None:
            self.pcs = self.pcs.loc[self.samples,:]
            self.pc_samples = np.array(self.pcs.index)

[docs]    def regress(self):
        """
        Regress out covariates (optional).
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.phenotypes** (np.array):
                  [`M` x `P`] phenotype matrix of residuals of linear model
                - **self.covariates**:
                  None
        
        Examples:
        
            .. doctest::
        
                >>> from limmbo.io import input
                >>> import numpy as np
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> random = RandomState(5)
                >>> P = 5
                >>> K = 4
                >>> N = 100
                >>> pheno = random.normal(0,1, (N, P))
                >>> pheno_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> phenotype_ID = np.array(['ID{}'.format(x+1)
                ...     for x in range(P)])
                >>> covariates = random.normal(0,1, (N, K))
                >>> covs_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = pheno,
                ...                      pheno_samples = pheno_samples,
                ...                      phenotype_ID = phenotype_ID)
                >>> indata.addCovariates(covariates = covariates,
                ...                      covs_samples = covs_samples)
                >>> indata.phenotypes.values[:3, :3]
                array([[ 0.44122749, -0.33087015,  2.43077119],
                       [ 1.58248112, -0.9092324 , -0.59163666],
                       [-1.19276461, -0.20487651, -0.35882895]])
                >>> indata.regress()
                >>> indata.phenotypes.values[:3, :3]
                array([[ 0.34421705, -0.01470998,  2.25710966],
                       [ 1.69886647, -1.41756814, -0.55614649],
                       [-1.10700674, -0.66017713, -0.22201814]])
        """
        if np.array_equal(self.phenotypes, self.covariates):
            raise DataMismatch(('Phenotype and covariate arrays are '
                    'identical'))
        verboseprint('Regress covariates', verbose=self.verbose)
        phenotypes = regressOut(np.array(self.phenotypes),
                np.array(self.covariates))
        self.phenotypes = pd.DataFrame(phenotypes,
            index=self.phenotypes.index, columns=self.phenotypes.columns)
        self.covariates = None

[docs]    def transform(self, transform):
        """
        Transform phenotypes
        
        Arguments:
            transform (string):
                transformation method for phenotype data:
        
                    - scale:
                      mean center, divide by sd
                    - gaussian:
                      inverse normalisation
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.phenotypes** (np.array):
                  [`N` x `P`] (transformed) phenotype matrix
        
        Examples:
        
            .. doctest::
        
                >>> from limmbo.io import input
                >>> import numpy as np
                >>> from numpy.random import RandomState
                >>> from numpy.linalg import cholesky as chol
                >>> random = RandomState(5)
                >>> P = 5
                >>> K = 4
                >>> N = 100
                >>> pheno = random.normal(0,1, (N, P))
                >>> pheno_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> phenotype_ID = np.array(['ID{}'.format(x+1)
                ...     for x in range(P)])
                >>> SNP = 1000
                >>> X = (random.rand(N, SNP) < 0.3).astype(float)
                >>> relatedness = np.dot(X, X.T)/float(SNP)
                >>> relatedness_samples = np.array(['S{}'.format(x+1)
                ...     for x in range(N)])
                >>> indata = input.InputData(verbose=False)
                >>> indata.addPhenotypes(phenotypes = pheno,
                ...                      pheno_samples = pheno_samples,
                ...                      phenotype_ID = phenotype_ID)
                >>> indata.addRelatedness(relatedness = relatedness,
                ...                  relatedness_samples = relatedness_samples)
                >>> indata.phenotypes.values[:3, :3]
                array([[ 0.44122749, -0.33087015,  2.43077119],
                       [ 1.58248112, -0.9092324 , -0.59163666],
                       [-1.19276461, -0.20487651, -0.35882895]])
                >>> indata.transform(transform='gaussian')
                >>> indata.phenotypes.values[:3, :3]
                array([[ 0.23799988, -0.11191464,  2.05785598],
                       [ 1.41041953, -0.81365681, -0.92217818],
                       [-1.55977999,  0.01240937, -0.62091817]])
        """
        if transform == 'scale':
            verboseprint('Use %s as transformation' % transform, 
                verbose=self.verbose)
            phenotypes = scale(self.phenotypes)
            self.phenotypes = pd.DataFrame(phenotypes,
                index=self.phenotypes.index, columns=self.phenotypes.columns)
        elif transform == 'gaussian':
            verboseprint('Use %s as transformation' % transform, 
                verbose=self.verbose)
            phenotypes = np.apply_along_axis(quantile_gaussianize, 0, 
                    self.phenotypes)
            self.phenotypes = pd.DataFrame(phenotypes,
                index=self.phenotypes.index, columns=self.phenotypes.columns)
        else:
            raise TypeError(('Possible transformation methods are: scale, '
                    'and gaussian but {} provided').format(transform))

[docs]    def standardiseGenotypes(self):
        r"""
        Standardise genotypes:

        .. math::
           w_{ij} = \frac{x_{ij} -2p_i}{\sqrt{2p_i (1-p_i)}} 
        
        where :math:`x_{ij}` is the number of copies of the reference allele for 
        the :math:`i` th SNP of the :math:`j` th individual and :math:`p_i` is 
        the frequency of the reference allele (as described in `(Yang et al
        2011)
        <http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3014363&tool=pmcentrez&rendertype=abstract>`_).
        
        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.genotypes_sd** (numpy array):
                  [`N` x `NrSNP`] matrix of `NrSNP` standardised genotypes for
                  `N` samples.
        
        Examples:
        
            .. doctest::
        
                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> from limmbo.utils.utils import makeHardCalledGenotypes
                >>> from limmbo.utils.utils import AlleleFrequencies
                >>> data = reader.ReadData(verbose=False)
                >>> file_geno = resource_filename('limmbo',
                ...                                'io/test/data/genotypes.csv')
                >>> data.getGenotypes(file_genotypes=file_geno)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addGenotypes(genotypes=data.genotypes,
                ...                     genotypes_info=data.genotypes_info)
                >>> geno_sd = indata.standardiseGenotypes()
                >>> geno_sd.iloc[:5,:3]
                             0         1       2
                ID_1 -2.201123 -2.141970 -8.9622
                ID_2 -2.201123 -2.141970 -8.9622
                ID_3 -2.201123 -2.141970 -8.9622
                ID_4  0.908627 -0.604125 -8.9622
                ID_5 -0.646248 -2.141970 -8.9622
        """
        self.genotypes_sd = np.zeros(self.genotypes.shape)
        for snp in range(self.genotypes.shape[1]):
            p, q = AlleleFrequencies(self.genotypes.iloc[:, snp])
            var_snp = sqrt(2 * p * q)
            for n in range(self.genotypes.iloc[:, snp].shape[0]):
                self.genotypes_sd[n, snp] = (np.array(self.genotypes)[n, snp] -
                    2 * q) / var_snp

        self.genotypes_sd = pd.DataFrame(self.genotypes_sd,
                index=self.genotypes.index)

        return self.genotypes_sd

[docs]    def getAlleleFrequencies(self):
        """
        Compute allele frequencies of genotypes.

        Returns:
            None:
                updated the following attributes of the InputData instance:
        
                - **self.freqs** (pandas DataFrame):
                  [`NrSNP` x `2`] matrix of alt and ref allele frequencies; 
                  index: snp IDs
        
        Examples:
        
            .. doctest::
        
                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io import input
                >>> from limmbo.utils.utils import makeHardCalledGenotypes
                >>> from limmbo.utils.utils import AlleleFrequencies
                >>> data = reader.ReadData(verbose=False)
                >>> file_geno = resource_filename('limmbo',
                ...                                'io/test/data/genotypes.csv')
                >>> data.getGenotypes(file_genotypes=file_geno)
                >>> indata = input.InputData(verbose=False)
                >>> indata.addGenotypes(genotypes=data.genotypes,
                ...                     genotypes_info=data.genotypes_info,
                ...                     geno_samples=data.geno_samples)
                >>> freqs = indata.getAlleleFrequencies()
                >>> freqs.iloc[:5,:]
                                   p         q
                rs1601111   0.292186  0.707814
                rs13270638  0.303581  0.696419
                rs75132935  0.024295  0.975705
                rs72668606  0.119091  0.880909
                rs55770986  0.169338  0.830662
        """
        verboseprint('Get allele frequencies of %s snps'.format(
            self.genotypes.shape[1]), verbose=self.verbose)
        self.freqs = np.zeros((self.genotypes.shape[1], 2))
        for snp in range(self.genotypes.shape[1]):
            self.freqs[snp, 0], self.freqs[snp, 1] = AlleleFrequencies(
                np.array(self.genotypes)[:, snp])

        self.freqs = pd.DataFrame(self.freqs, index=self.genotypes_info.index, 
                columns=['p', 'q'])
        return self.freqs

    @staticmethod
    def _is_positive_definite(matrix):
        try: 
            chol_matrix = np.linalg.cholesky(matrix)
            return(True)
        except np.linalg.linalg.LinAlgError:
            return(False)