import scipy as sp
import pandas as pd
import numpy as np
import re
from limix.io import read_plink
from limmbo.utils.utils import verboseprint
from limmbo.io.utils import file_type
class MissingInput(Exception):
"""Raised when no appropriate input is given"""
pass
class FormatError(Exception):
"""Raised when no appropriate input is given"""
pass
class DataMismatch(Exception):
"""Raised when dimensions of sample/ID names do not match dimension of
corresponding data"""
pass
[docs]class ReadData(object):
r"""
Generate object containing all datasets relevant for the analysis.
For variance decomposition, at least phenotypes and relatedness estimates
need to be specified.
For association testing with LMM, at least phenotype, relatedness estimates
and genotypes need to be read.
"""
def __init__(self, verbose=True):
self.verbose = verbose
self.samples = None
self.phenotypes = None
self.pheno_samples = None
self.phenotype_ID = None
self.covariates = None
self.covs_samples = None
self.relatedness = None
self.relatedness_samples = None
self.genotypes = None
self.geno_samples = None
self.genotypes_info = None
self.pcs = None
self.pc_samples = None
self.snps = None
self.geno_samples = None
self.position = None
self.Cg = None
self.Cn = None
[docs] def getPhenotypes(self, file_pheno=None, delim=","):
r"""
Reads [`N` x `P`] phenotype file; file ending must be either .txt or
.csv
Arguments:
file_pheno (string):
path to [(`N`+1) x (`P`+1)] phenotype file with: [`N`] sample
IDs in the first column and [`P`] phenotype IDs in the first row
delim (string):
delimiter of phenotype file, one of " ", ",", "\t"
Returns:
None:
updated the following attributes of the ReadData instance:
- **self.phenotypes** (np.array):
[`N` x `P`] phenotype matrix
Examples:
.. doctest::
>>> from pkg_resources import resource_filename
>>> from limmbo.io.reader import ReadData
>>> from limmbo.io.utils import file_type
>>> data = ReadData(verbose=False)
>>> file_pheno = resource_filename('limmbo',
... 'io/test/data/pheno.csv')
>>> data.getPhenotypes(file_pheno=file_pheno)
>>> data.phenotypes.index[:3]
Index([u'ID_1', u'ID_2', u'ID_3'], dtype='object')
>>> data.phenotypes.columns[:3]
Index([u'trait_1', u'trait_2', u'trait_3'], dtype='object')
>>> data.phenotypes.values[:3,:3]
array([[-1.56760036, -1.5324513 , 1.17789321],
[-0.85655034, 0.48358151, 1.35664966],
[ 0.10772832, -0.02262884, -0.27963328]])
"""
if file_pheno is None:
raise MissingInput('No phenotype file specified')
else:
if file_type(file_pheno) is not "delim":
raise FormatError('Supplied phenotype file is neither .csv '
'nor .txt')
verboseprint(
"Reading phenotypes from %s" % file_pheno, verbose=self.verbose)
try:
self.phenotypes = pd.io.parsers.read_csv(
file_pheno, index_col=0, sep=delim)
except Exception:
raise IOError('{} could not be opened'.format(file_pheno))
[docs] def getCovariates(self, file_covariates=None, delim=','):
r"""
Reads [`N` x `K`] covariate matrix with [`N`] samples and [`K`]
covariates.
Arguments:
file_covariates (string):
[`N` x (`K` +1)] covariates file with [`N`] sample IDs in the
first column
delim (string):
delimiter of covariates file, one of " ", ",", "\t"
Returns:
None:
updated the following attributes of the ReadData instance:
- **self.covariates** (np.array):
[`N` x `K`] covariates matrix
Examples:
.. doctest::
>>> from pkg_resources import resource_filename
>>> from limmbo.io.reader import ReadData
>>> from limmbo.io.utils import file_type
>>> data = ReadData(verbose=False)
>>> file_covs = resource_filename('limmbo',
... 'io/test/data/covs.csv')
>>> data.getCovariates(file_covariates=file_covs)
>>> data.covariates.index[:3]
Index([u'ID_1', u'ID_2', u'ID_3'], dtype='object')
>>> data.covariates.values[:3,:3]
array([[ 0.92734699, 1.59767659, -0.67263682],
[ 0.57061985, -0.84679736, -1.11037123],
[ 0.44201204, -1.61499228, 0.23302345]])
"""
if file_covariates is not None:
if file_type(file_covariates) is not 'delim':
raise FormatError('Supplied covariate file is not .csv or .txt')
try:
self.covariates = pd.io.parsers.read_csv(file_covariates,
sep=delim, index_col=0)
verboseprint("Reading covariates file", verbose=self.verbose)
except Exception:
raise IOError('{} could not be opened'.format(file_covariates))
# append column of 1's to adjust for mean of covariates
self.covariates = pd.concat([self.covariates, pd.DataFrame(sp.ones(
self.covariates.shape[0]), index=self.covariates.index)],
axis=1)
else:
verboseprint("No covariates set", verbose=self.verbose)
self.covariates = None
verboseprint("Reading relationship matrix", verbose=self.verbose)
[docs] def getPCs(self, file_pcs=None, nrpcs=None, delim=","):
r"""
Reads file with [`N` x `PC`] matrix of [`PC`] principal components from
the genotypes of [`N`] samples.
Arguments:
file_pcs (string):
[`N` x (`PC` +1)] PCA file with [`N`] sample IDs in the first
column
delim (string):
delimiter of PCA file, one of " ", ",", "\t"
nrpcs (integer):
Number of PCs to use (uses first nrpcs principal components)
Returns:
None:
updated the following attributes of the ReadData instance:
- **self.pcs** (np.array):
[`N` x `PC`] principal component matrix
Examples:
.. doctest::
>>> from pkg_resources import resource_filename
>>> from limmbo.io.reader import ReadData
>>> from limmbo.io.utils import file_type
>>> data = ReadData(verbose=False)
>>> file_pcs = resource_filename('limmbo',
... 'io/test/data/pcs.csv')
>>> data.getPCs(file_pcs=file_pcs, nrpcs=10, delim=" ")
>>> data.pcs.index[:3]
Index([u'ID_1', u'ID_2', u'ID_3'], dtype='object', name=0)
>>> data.pcs.values[:3,:3]
array([[-0.02632738, -0.05791269, -0.03619099],
[ 0.00761785, 0.00538956, 0.00624196],
[ 0.01069307, -0.0205066 , 0.02299996]])
"""
if file_type(file_pcs) is not 'delim':
raise FormatError('Supplied PCA file is not .csv or .txt')
if file_pcs is not None:
verboseprint("Reading PCs", verbose=self.verbose)
self.pcs = pd.io.parsers.read_csv(file_pcs, index_col=0,
sep=delim, header=None)
if nrpcs is not None:
verboseprint(
"Extracting first %s pcs" % nrpcs, verbose=self.verbose)
self.pcs = self.pcs.iloc[:, :nrpcs]
else:
verboseprint("No pcs set", verbose=self.verbose)
self.pcs = None
[docs] def getGenotypes(self, file_genotypes=None, delim = ','):
r"""
Reads genotype file in the following formats: plink (.bed, .bim, .fam),
gen (.gen, .sample) or comma-separated values (.csv) file.
Arguments:
file_geno (string):
path to phenotype file in .plink or .csv format
- **plink format**:
as specified in the plink `user manual <https://www.cog-genomics.org/plink/1.9/input>`_,
binary plink format with .bed, .fam and .bim file
- **.csv format**:
- [(`NrSNP` + 1) x (`N`+1)] .csv file with: [`N`] sample IDs
in the first row and [`NrSNP`] genotype IDs in the first
column
- sample IDs should be of type: chrom-pos-rsID for instance
22-50714616-rs6010226
delim (string):
delimiter of genotype file (when text format), one of " ", ",",
"\t"
Returns:
None:
updated the following attributes of the ReadData instance:
- **self.genotypes** (np.array):
[`N` x `NrSNPs`] genotype matrix
- **self.genotypes_info** (pd.dataframe):
[`NrSNPs` x 2] dataframe with columns 'chrom' and 'pos', and
rsIDs as index
Examples:
.. doctest::
>>> from pkg_resources import resource_filename
>>> from limmbo.io import reader
>>> from limmbo.io.utils import file_type
>>> data = reader.ReadData(verbose=False)
>>> # Read genotypes in delim-format
>>> file_geno = resource_filename('limmbo',
... 'io/test/data/genotypes.csv')
>>> data.getGenotypes(file_genotypes=file_geno)
>>> data.genotypes.index[:4]
Index([u'ID_1', u'ID_2', u'ID_3', u'ID_4'], dtype='object')
>>> data.genotypes.shape
(1000, 20)
>>> data.genotypes.values[:5,:5]
array([[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[2., 1., 0., 0., 0.],
[1., 0., 0., 0., 0.]])
>>> data.genotypes_info[:5]
chrom pos
rs1601111 3 88905003
rs13270638 8 20286021
rs75132935 8 76564608
rs72668606 8 79733124
rs55770986 7 2087823
>>> ### read genotypes in plink format
>>> file_geno = resource_filename('limmbo',
... 'io/test/data/genotypes')
>>> data.getGenotypes(file_genotypes=file_geno)
>>> data.genotypes.values[:5,:5]
array([[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[2., 1., 0., 0., 0.],
[1., 0., 0., 0., 0.]])
>>> data.genotypes_info[:5]
chrom pos
rs1601111 3 88905003
rs13270638 8 20286021
rs75132935 8 76564608
rs72668606 8 79733124
rs55770986 7 2087823
"""
if file_genotypes is None:
raise MissingInput('No genotypes data specified')
if file_type(file_genotypes) not in ['delim', 'bed']:
raise FormatError(('Supplied genotype file is neither in plink nor '
'.csv/.txt format'))
verboseprint("Reading genotypes from %s" % file_genotypes,
verbose=self.verbose)
if file_type(file_genotypes) is 'delim':
try:
genotypes = pd.io.parsers.read_csv(
file_genotypes, sep=delim, index_col=0, header=0)
except Exception:
raise IOError('{} could not be opened'.format(file_genotypes))
info = np.array(genotypes.index)
genotypes_info = []
snp_ID = []
for id in range(info.shape[0]):
split = np.array(info[id].split('-'))
snp_ID.append(split[2])
genotypes_info.append(split[[0, 1]])
self.genotypes = genotypes.astype(float).T
self.genotypes_info = pd.DataFrame(
np.array(genotypes_info),
columns=['chrom', 'pos'],
index=snp_ID)
if file_type(file_genotypes) is 'bed':
try:
(bim, fam, bed) = read_plink(file_genotypes,
verbose=self.verbose)
except Exception:
raise IOError('{} could not be opened'.format(file_genotypes))
self.genotypes = pd.DataFrame(bed.compute()).astype(float).T
self.genotypes.index = fam.iid
self.genotypes_info = pd.DataFrame(
np.array([bim.chrom, bim.pos]).T,
columns=['chrom', 'pos'],
index=bim.snp)
self.genotypes_info.index.name = None
[docs] def getVarianceComponents(self, file_Cg=None, file_Cn=None, delim_cg=",",
delim_cn=","):
r"""
Reads a comma-separated files with [`P` x `P`] matrices of [`P`] trait
covariance estimates.
Arguments:
file_Cg (string):
[`P` x `P`] .csv file with [`P`] trait covariance estimates of
the genetic component
file_Cn (string):
[`P` x `P`] .csv file with [`P`] trait covariance estimates of
the non-genetic (noise) component
Returns:
None:
updated the following attributes of the ReadData instance:
- **self.Cg** (np.array):
[`P` x `P`] matrix with trait covariance of the genetic
component
- **self.Cn** (np.array):
[`P` x `P`] matrix with trait covariance of the non-genetic
(noise) component
Examples:
.. doctest::
>>> from pkg_resources import resource_filename
>>> from limmbo.io.reader import ReadData
>>> data = ReadData()
>>> file_Cg = resource_filename('limmbo',
... 'io/test/data/Cg.csv')
>>> file_Cn = resource_filename('limmbo',
... 'io/test/data/Cn.csv')
>>> data.getVarianceComponents(file_Cg=file_Cg, file_Cn=file_Cn)
>>> data.Cg.shape
(10, 10)
>>> data.Cn.shape
(10, 10)
>>> data.Cg[:3,:3]
array([[ 0.45446454, -0.21084613, 0.01440468],
[-0.21084613, 0.11443656, 0.01250233],
[ 0.01440468, 0.01250233, 0.02347906]])
>>> data.Cn[:3,:3]
array([[ 0.53654803, -0.14392748, -0.45483001],
[-0.14392748, 0.88793093, 0.30539822],
[-0.45483001, 0.30539822, 0.97785614]])
"""
if file_Cg is None and file_Cn is None:
verboseprint(
("No variance components supplied, estimate variance components "
"before lmm test"),
verbose=self.verbose)
elif file_Cg is None or file_Cn is None:
raise IOError('Both variant components need to be supplied:',
'Cg is %s and Cn is %s') % (file_Cg, file_Cn)
else:
try:
self.Cg = np.array(
pd.io.parsers.read_csv(file_Cg, header=None, sep=delim_cg))
except Exception:
raise IOError('{} could not be opened'.format(file_Cg))
try:
self.Cn = np.array(
pd.io.parsers.read_csv(file_Cn, header=None, sep=delim_cn))
except Exception:
raise IOError('{} could not be opened'.format(file_Cn))
[docs] def getTraitSubset(self, traitstring = None):
"""
Limit analysis to specific subset of traits
Arguments:
traitstring (string):
comma-separated trait numbers (for single traits) or hyphen-
separated trait numbers (for trait ranges) or combination of
both for trait selection (1-based)
Returns:
(numpy array)
array containing list of trait IDs
Examples:
.. doctest::
>>> from limmbo.io import reader
>>> data = reader.ReadData(verbose=False)
>>> traitlist = data.getTraitSubset("1,3,5,7-10")
>>> print traitlist
[0 2 4 6 7 8 9]
"""
if traitstring is None:
verboseprint('No trait subset chosen', verbose=self.verbose)
else:
verboseprint('Chose subset of {} traits'.format(traitstring),
verbose=self.verbose)
search = re.compile('[^0-9,-]').search
if bool(search(traitstring)):
raise FormatError(('Traitstring can only contain integers '
'(0-9), comma (,) and hyphen (-), but {}'
'provided').format(traitstring))
traitslist = [x.split('-') for x in traitstring.split(',') ]
traitsarray = []
for t in traitslist:
if len(t) == 1:
traitsarray.append(int(t[0]) - 1)
else:
[traitsarray.append(x) for x in range(int(t[0]) - 1, int(t[1])) ]
return np.array(traitsarray)
[docs] def getSampleSubset(self, file_samplelist=None, samplelist=None):
r"""
Read file or string with subset of sample IDs to analyse.
Arguments:
file_samplelist (string):
"path/to/file_samplelist": file contains subset sample IDs with
one ID per line, no header.
samplestring (string):
comma-separated list of sample IDs e.g. "ID1,ID2,ID5,ID10".
Returns:
(numpy array)
array containing list of sample IDs
"""
if file_samplelist is not None and samplelist is not None:
raise IOError("Only one of file_samplelist or samplelist can "
"be specified")
if file_samplelist is not None or samplelist is not None:
if file_samplelist is not None:
verboseprint("Read sample list from file", verbose=self.verbose)
try:
samplelist = pd.io.parsers.read_csv(
file_samplelist, sep=" ", header=None, index_col=0)
except Exception:
raise IOError('{} could not be opened'.format(
file_samplelist))
samplelist = samplelist.index
else:
verboseprint("Split sample string", verbose=self.verbose)
samplelist = samplelist.split(",")
verboseprint("Number of samples in sample list: %s" %
len(samplelist), verbose=self.verbose)
return samplelist