Source code for limmbo.io.reader

import scipy as sp
import pandas as pd
import numpy as np
import re

from limix.io import read_plink

from limmbo.utils.utils import verboseprint
from limmbo.io.utils import file_type

class MissingInput(Exception):
    """Raised when no appropriate input is given"""
    pass


class FormatError(Exception):
    """Raised when no appropriate input is given"""
    pass


class DataMismatch(Exception):
    """Raised when dimensions of sample/ID names do not match dimension of
     corresponding data"""
    pass


[docs]class ReadData(object):
    r"""
    Generate object containing all datasets relevant for the analysis.
    For variance decomposition, at least phenotypes and relatedness estimates
    need to be specified.
    For association testing with LMM, at least phenotype, relatedness estimates
    and genotypes need to be read.
    """

    def __init__(self, verbose=True):
        self.verbose = verbose
        self.samples = None
        self.phenotypes = None
        self.pheno_samples = None
        self.phenotype_ID = None
        self.covariates = None
        self.covs_samples = None
        self.relatedness = None
        self.relatedness_samples = None
        self.genotypes = None
        self.geno_samples = None
        self.genotypes_info = None
        self.pcs = None
        self.pc_samples = None
        self.snps = None
        self.geno_samples = None
        self.position = None
        self.Cg = None
        self.Cn = None

[docs]    def getPhenotypes(self, file_pheno=None, delim=","):
        r"""
        Reads [`N` x `P`] phenotype file; file ending must be either .txt or 
        .csv

        Arguments:
            file_pheno (string):
                path to [(`N`+1) x (`P`+1)] phenotype file with: [`N`] sample 
                IDs in the first column and [`P`] phenotype IDs in the first row
            delim (string): 
                delimiter of phenotype file, one of " ", ",", "\t"

        Returns:
            None:
                updated the following attributes of the ReadData instance:

                - **self.phenotypes** (np.array):
                  [`N` x `P`] phenotype matrix

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io.reader import ReadData
                >>> from limmbo.io.utils import file_type
                >>> data = ReadData(verbose=False)
                >>> file_pheno = resource_filename('limmbo',
                ...                                'io/test/data/pheno.csv')
                >>> data.getPhenotypes(file_pheno=file_pheno)
                >>> data.phenotypes.index[:3]
                Index([u'ID_1', u'ID_2', u'ID_3'], dtype='object')
                >>> data.phenotypes.columns[:3]
                Index([u'trait_1', u'trait_2', u'trait_3'], dtype='object')
                >>> data.phenotypes.values[:3,:3]
                array([[-1.56760036, -1.5324513 ,  1.17789321],
                       [-0.85655034,  0.48358151,  1.35664966],
                       [ 0.10772832, -0.02262884, -0.27963328]])
        """

        if file_pheno is None:
            raise MissingInput('No phenotype file specified')
        else:
            if file_type(file_pheno) is not "delim": 
                raise FormatError('Supplied phenotype file is neither .csv ' 
                        'nor .txt')
            verboseprint(
                "Reading phenotypes from %s" % file_pheno, verbose=self.verbose)
            try:
                self.phenotypes = pd.io.parsers.read_csv(
                    file_pheno, index_col=0, sep=delim)
            except Exception:
                raise IOError('{} could not be opened'.format(file_pheno))

[docs]    def getCovariates(self, file_covariates=None, delim=','):
        r"""
        Reads [`N` x `K`] covariate matrix with [`N`] samples and [`K`] 
        covariates.

        Arguments:
            file_covariates (string):
                [`N` x (`K` +1)] covariates file with [`N`] sample IDs in the 
                first column
            delim (string): 
                delimiter of covariates file, one of " ", ",", "\t"

        Returns:
            None:
                updated the following attributes of the ReadData instance:

                - **self.covariates** (np.array):
                  [`N` x `K`] covariates matrix

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io.reader import ReadData
                >>> from limmbo.io.utils import file_type
                >>> data = ReadData(verbose=False)
                >>> file_covs = resource_filename('limmbo',
                ...                               'io/test/data/covs.csv')
                >>> data.getCovariates(file_covariates=file_covs)
                >>> data.covariates.index[:3]
                Index([u'ID_1', u'ID_2', u'ID_3'], dtype='object')
                >>> data.covariates.values[:3,:3]
                array([[ 0.92734699,  1.59767659, -0.67263682],
                       [ 0.57061985, -0.84679736, -1.11037123],
                       [ 0.44201204, -1.61499228,  0.23302345]])
        """

        if file_covariates is not None:
            if file_type(file_covariates) is not 'delim':
                raise FormatError('Supplied covariate file is not .csv or .txt')
            try:
                self.covariates = pd.io.parsers.read_csv(file_covariates, 
                        sep=delim, index_col=0)
                verboseprint("Reading covariates file", verbose=self.verbose)
            except Exception:
                raise IOError('{} could not be opened'.format(file_covariates))
            # append column of 1's to adjust for mean of covariates
            self.covariates = pd.concat([self.covariates, pd.DataFrame(sp.ones(
                self.covariates.shape[0]), index=self.covariates.index)],
                 axis=1)
        else:
            verboseprint("No covariates set", verbose=self.verbose)
            self.covariates = None

[docs]    def getRelatedness(self, file_relatedness, delim=","):
        """
        Read file of [`N` x `N`] pairwise relatedness estimates of [`N`] 
        samples.

        Arguments:
            file_relatedness (string):
                [(`N` + `1`) x N] .csv file with: [`N`] sample IDs in the first
                row
            delim (string): 
                delimiter of covariate file, one of " ", ",", "\t"

        Returns:
            None:
                updated the following attributes of the ReadData instance:

                - **self.relatedness** (np.array):
                  [`N` x `N`] relatedness matrix
        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io.reader import ReadData
                >>> from limmbo.io.utils import file_type
                >>> data = ReadData(verbose=False)
                >>> file_relatedness = resource_filename('limmbo',
                ...                     'io/test/data/relatedness.csv')
                >>> data.getRelatedness(file_relatedness=file_relatedness)
                >>> data.relatedness.index[:3]
                Index([u'ID_1', u'ID_2', u'ID_3'], dtype='object')
                >>> data.relatedness.columns[:3]
                Index([u'ID_1', u'ID_2', u'ID_3'], dtype='object')
                >>> data.relatedness.values[:3,:3]
                array([[1.00892922e+00, 2.00758504e-04, 4.30499103e-03],
                       [2.00758504e-04, 9.98944885e-01, 4.86487318e-03],
                       [4.30499103e-03, 4.86487318e-03, 9.85787665e-01]])
        """

        if file_relatedness is None:
            raise MissingInput('No relatedness data specified')
        if file_type(file_relatedness) is not 'delim':
            raise FormatError('Supplied relatedness file is not .csv or .txt')
        try:
            self.relatedness = pd.io.parsers.read_csv(file_relatedness, 
                    sep=delim)
            self.relatedness.index = self.relatedness.columns
        except Exception:
            raise IOError('{} could not be opened'.format(file_relatedness))
        verboseprint("Reading relationship matrix", verbose=self.verbose)

[docs]    def getPCs(self, file_pcs=None, nrpcs=None, delim=","):
        r"""
        Reads file with [`N` x `PC`] matrix of [`PC`] principal components from 
        the genotypes of [`N`] samples.

        Arguments:
            file_pcs (string):
                [`N` x (`PC` +1)] PCA file with [`N`] sample IDs in the first
                column
            delim (string): 
                delimiter of PCA file, one of " ", ",", "\t"
            nrpcs (integer):
                Number of PCs to use (uses first nrpcs  principal components)

        Returns:
            None:
                updated the following attributes of the ReadData instance:

                - **self.pcs** (np.array):
                  [`N` x `PC`] principal component matrix

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io.reader import ReadData
                >>> from limmbo.io.utils import file_type
                >>> data = ReadData(verbose=False)
                >>> file_pcs = resource_filename('limmbo',
                ...                     'io/test/data/pcs.csv')
                >>> data.getPCs(file_pcs=file_pcs, nrpcs=10, delim=" ")
                >>> data.pcs.index[:3]
                Index([u'ID_1', u'ID_2', u'ID_3'], dtype='object', name=0)
                >>> data.pcs.values[:3,:3]
                array([[-0.02632738, -0.05791269, -0.03619099],
                       [ 0.00761785,  0.00538956,  0.00624196],
                       [ 0.01069307, -0.0205066 ,  0.02299996]])
        """

        if file_type(file_pcs) is not 'delim':
            raise FormatError('Supplied PCA file is not .csv or .txt')

        if file_pcs is not None:
            verboseprint("Reading PCs", verbose=self.verbose)
            self.pcs = pd.io.parsers.read_csv(file_pcs, index_col=0, 
                            sep=delim, header=None)
            if nrpcs is not None:
                verboseprint(
                    "Extracting first %s pcs" % nrpcs, verbose=self.verbose)
                self.pcs = self.pcs.iloc[:, :nrpcs]
        else:
            verboseprint("No pcs set", verbose=self.verbose)
            self.pcs = None

[docs]    def getGenotypes(self, file_genotypes=None, delim = ','):
        r"""
        Reads genotype file in the following formats: plink (.bed, .bim, .fam),  
        gen (.gen, .sample) or comma-separated values (.csv) file.

        Arguments:
            file_geno (string):
                path to phenotype file in .plink or .csv format
                - **plink format**:
                  
                  as specified in the plink `user manual <https://www.cog-genomics.org/plink/1.9/input>`_, 
                  binary plink format with .bed, .fam and .bim file

                - **.csv format**:

                  - [(`NrSNP` + 1) x (`N`+1)] .csv file with: [`N`] sample IDs
                    in the first row and [`NrSNP`] genotype IDs in the first
                    column
                  - sample IDs should be of type: chrom-pos-rsID for instance
                    22-50714616-rs6010226
            delim (string): 
                delimiter of genotype file (when text format), one of " ", ",", 
                "\t"

        Returns:
            None:
                updated the following attributes of the ReadData instance:

                - **self.genotypes** (np.array):
                  [`N` x `NrSNPs`] genotype matrix
                - **self.genotypes_info** (pd.dataframe):
                  [`NrSNPs` x 2] dataframe with columns 'chrom' and 'pos', and
                  rsIDs as index

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io import reader
                >>> from limmbo.io.utils import file_type
                >>> data = reader.ReadData(verbose=False)
                >>> # Read genotypes in delim-format
                >>> file_geno = resource_filename('limmbo',
                ...     'io/test/data/genotypes.csv')
                >>> data.getGenotypes(file_genotypes=file_geno)
                >>> data.genotypes.index[:4]
                Index([u'ID_1', u'ID_2', u'ID_3', u'ID_4'], dtype='object')
                >>> data.genotypes.shape
                (1000, 20)
                >>> data.genotypes.values[:5,:5]
                array([[0., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 0.],
                       [2., 1., 0., 0., 0.],
                       [1., 0., 0., 0., 0.]])
                >>> data.genotypes_info[:5]
                           chrom       pos
                rs1601111      3  88905003
                rs13270638     8  20286021
                rs75132935     8  76564608
                rs72668606     8  79733124
                rs55770986     7   2087823
                >>> ### read genotypes in plink format
                >>> file_geno = resource_filename('limmbo',
                ...     'io/test/data/genotypes')
                >>> data.getGenotypes(file_genotypes=file_geno)
                >>> data.genotypes.values[:5,:5]
                array([[0., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 0.],
                       [2., 1., 0., 0., 0.],
                       [1., 0., 0., 0., 0.]])
                >>> data.genotypes_info[:5]
                           chrom       pos
                rs1601111      3  88905003
                rs13270638     8  20286021
                rs75132935     8  76564608
                rs72668606     8  79733124
                rs55770986     7   2087823
        """

        if file_genotypes is None:
            raise MissingInput('No genotypes data specified')
        if file_type(file_genotypes) not in ['delim', 'bed']:
            raise FormatError(('Supplied genotype file is neither in plink nor '
                    '.csv/.txt format'))
        verboseprint("Reading genotypes from %s" % file_genotypes, 
                verbose=self.verbose)
        
        
        if file_type(file_genotypes) is 'delim':
            try:
                genotypes = pd.io.parsers.read_csv(
                    file_genotypes, sep=delim, index_col=0, header=0)
            except Exception:
                raise IOError('{} could not be opened'.format(file_genotypes))

            info = np.array(genotypes.index)

            genotypes_info = []
            snp_ID = []
            for id in range(info.shape[0]):
                split = np.array(info[id].split('-'))
                snp_ID.append(split[2])
                genotypes_info.append(split[[0, 1]])

            self.genotypes = genotypes.astype(float).T
            self.genotypes_info = pd.DataFrame(
                np.array(genotypes_info),
                columns=['chrom', 'pos'],
                index=snp_ID)
        
        if file_type(file_genotypes) is 'bed':
            try:
                (bim, fam, bed) = read_plink(file_genotypes,
                        verbose=self.verbose)
            except Exception:
                raise IOError('{} could not be opened'.format(file_genotypes))
            
            self.genotypes = pd.DataFrame(bed.compute()).astype(float).T
            self.genotypes.index = fam.iid
            self.genotypes_info = pd.DataFrame(
                np.array([bim.chrom, bim.pos]).T,
                columns=['chrom', 'pos'],
                index=bim.snp)
            self.genotypes_info.index.name = None

[docs]    def getVarianceComponents(self, file_Cg=None, file_Cn=None, delim_cg=",",
            delim_cn=","):
        r"""
        Reads a comma-separated files with [`P` x `P`] matrices of [`P`] trait
        covariance estimates.

        Arguments:
            file_Cg (string):
                [`P` x `P`] .csv file with [`P`] trait covariance estimates of
                the genetic component
            file_Cn (string):
                [`P` x `P`] .csv file with [`P`] trait covariance estimates of
                the non-genetic (noise) component

        Returns:
            None:
                updated the following attributes of the ReadData instance:

                - **self.Cg** (np.array):
                  [`P` x `P`] matrix with trait covariance of the genetic
                  component
                - **self.Cn** (np.array):
                  [`P` x `P`] matrix with trait covariance of the non-genetic
                  (noise) component

        Examples:

            .. doctest::

                >>> from pkg_resources import resource_filename
                >>> from limmbo.io.reader import ReadData
                >>> data = ReadData()
                >>> file_Cg = resource_filename('limmbo',
                ...                     'io/test/data/Cg.csv')
                >>> file_Cn = resource_filename('limmbo',
                ...                     'io/test/data/Cn.csv')
                >>> data.getVarianceComponents(file_Cg=file_Cg, file_Cn=file_Cn)
                >>> data.Cg.shape
                (10, 10)
                >>> data.Cn.shape
                (10, 10)
                >>> data.Cg[:3,:3]
                array([[ 0.45446454, -0.21084613,  0.01440468],
                       [-0.21084613,  0.11443656,  0.01250233],
                       [ 0.01440468,  0.01250233,  0.02347906]])
                >>> data.Cn[:3,:3]
                array([[ 0.53654803, -0.14392748, -0.45483001],
                       [-0.14392748,  0.88793093,  0.30539822],
                       [-0.45483001,  0.30539822,  0.97785614]])
        """

        if file_Cg is None and file_Cn is None:
            verboseprint(
                ("No variance components supplied, estimate variance components "
                 "before lmm test"),
                verbose=self.verbose)
        elif file_Cg is None or file_Cn is None:
            raise IOError('Both variant components need to be supplied:',
                          'Cg is %s and Cn is %s') % (file_Cg, file_Cn)
        else:
            try:
                self.Cg = np.array(
                    pd.io.parsers.read_csv(file_Cg, header=None, sep=delim_cg))
            except Exception:
                raise IOError('{} could not be opened'.format(file_Cg))
            try:
                self.Cn = np.array(
                    pd.io.parsers.read_csv(file_Cn, header=None, sep=delim_cn))
            except Exception:
                raise IOError('{} could not be opened'.format(file_Cn))
    
[docs]    def getTraitSubset(self, traitstring = None):
        """
        Limit analysis to specific subset of traits
        
        Arguments:
            traitstring (string):
                comma-separated trait numbers (for single traits) or hyphen-
                separated trait numbers (for trait ranges) or combination of
                both for trait selection (1-based)
        
        Returns:
            (numpy array)
                array containing list of trait IDs
        
        Examples:
        
            .. doctest::
        
                >>> from limmbo.io import reader
                >>> data = reader.ReadData(verbose=False)
                >>> traitlist = data.getTraitSubset("1,3,5,7-10")
                >>> print traitlist
                [0 2 4 6 7 8 9]
        """
        if traitstring is None:
            verboseprint('No trait subset chosen', verbose=self.verbose)
        else:
            verboseprint('Chose subset of {} traits'.format(traitstring), 
                    verbose=self.verbose)
            search = re.compile('[^0-9,-]').search
            if bool(search(traitstring)):
                raise FormatError(('Traitstring can only contain integers '
                        '(0-9), comma (,) and hyphen (-), but {}'
                        'provided').format(traitstring))
            traitslist = [x.split('-') for x in traitstring.split(',') ]
            traitsarray = []
            for t in traitslist:
                if len(t) == 1:
                    traitsarray.append(int(t[0]) - 1)
                else:
                    [traitsarray.append(x) for x in range(int(t[0]) - 1, int(t[1])) ]
            return np.array(traitsarray)

[docs]    def getSampleSubset(self, file_samplelist=None, samplelist=None):
        r"""
        Read file or string with subset of sample IDs to analyse.

        Arguments:
            file_samplelist (string):
                "path/to/file_samplelist": file contains subset sample IDs with
                one ID per line, no header.
            samplestring (string): 
                comma-separated list of sample IDs e.g. "ID1,ID2,ID5,ID10".

        Returns:
            (numpy array)
                array containing list of sample IDs
        """
        if file_samplelist is not None and samplelist is not None:
           raise IOError("Only one of file_samplelist or samplelist can "
                    "be specified")
        if file_samplelist is not None or samplelist is not None:
            if file_samplelist is not None:
                verboseprint("Read sample list from file", verbose=self.verbose)
                try:
                    samplelist = pd.io.parsers.read_csv(
                        file_samplelist, sep=" ", header=None, index_col=0)
                except Exception:
                    raise IOError('{} could not be opened'.format(
                        file_samplelist))
                samplelist = samplelist.index
            else:
                verboseprint("Split sample string", verbose=self.verbose)
                samplelist = samplelist.split(",")

            verboseprint("Number of samples in sample list: %s" %
                len(samplelist), verbose=self.verbose)
            return samplelist