Source code for limmbo.utils.utils

######################
### import modules ###
######################

import scipy as sp
import scipy.linalg as la
import scipy.stats as stats
import numpy as np
import pandas as pd
from distutils.util import strtobool
from math import sqrt

[docs]def boolanize(string): r""" Convert command line parameter "True"/"False" into boolean Arguments: string (string): "False" or "True" Returns: (bool): False/True """
return bool(strtobool(string))
[docs]def nans(shape): r""" Create numpy array of NaNs Arguments: shape (tuple): shape of the empty array Returns: (numpy array): numpy array of NaNs """ a = np.empty(shape, dtype=float) a.fill(np.nan)
return a
[docs]def scale(x): r""" Mean center and unit variance input array Arguments: x (array-like): array to be scaled by column Returns: (numpy array): mean-centered, unit-variance array of x """ x = np.array(x) x -= x.mean(axis=0) x /= x.std(axis=0)
return x
[docs]def getEigen(covMatrix, reverse=True): r""" Get eigenvectors and values of hermitian matrix: Arguments: covMatrix (array-like): hermitian matrix reverse (bool): if True (default): order eigenvalues (and vectors) in decreasing order Returns: (tuple): tuple containing: - eigenvectors - eigenvalues """ covMatrix = np.array(covMatrix) S, U = la.eigh(covMatrix) if reverse == True: S = S[::-1] U = U[:, ::-1]
return (S, U)
[docs]def getVariance(eigenvalue): r""" Based on input eigenvalue computes cumulative sum and normalizes to overall sum to obtain variance explained Arguments: eigenvalue (array-like): eigenvalues Returns: (float): variance explained """ v = np.array(eigenvalue).cumsum() v /= v.max()
return (v)
[docs]def regularize(m, verbose=True): r""" Make matrix positive-semi definite by ensuring minimum eigenvalue >= 0: add absolute value of minimum eigenvalue and 1e-4 (for numerical stability of abs(min(eigenvalue) < 1e-4 to diagonal of matrix Arguments: m (array-like): symmetric matrix Returns: (tuple): Returns tuple containing: - positive, semi-definite matrix from input m (numpy array) - minimum eigenvalue of input m """ S, U = la.eigh(m) minS = S.min() if minS < 0: verboseprint( "Regularizing: minimum Eigenvalue %6.4f" % S.min(), verbose=verbose) m += (abs(S.min()) + 1e-4) * sp.eye(m.shape[0]) elif minS < 1e-4: verboseprint("Make numerically stable: minimum Eigenvalue %6.4f" % \ S.min(), verbose=verbose) m += 1e-4 * sp.eye(m.shape[0]) else: verboseprint("Minimum Eigenvalue %6.4f" % S.min(), verbose=verbose)
return (m, minS)
[docs]def generate_permutation(P, S, n, seed=12321, exclude_zero=False): r""" Generate permutation. Arguments: seed (int): used as seed for pseudo-random numbers generation; default: 12321 n (int): number of permutations to generated P (int): total number of traits S (int): subsampling size exclude_zero (bool): should zero be in set to draw from Returns: (list): Returns list of length n containing [np.arrays] of length [`S`] with subsets/permutations of numbers range(P) """ rand_state = np.random.RandomState(seed) return_list = [None] * n if exclude_zero: rangeP = range(P)[1:] else: rangeP = range(P) for i in xrange(n): perm_dic = rand_state.choice(a=rangeP, size=p, replace=False) return_list[i] = perm_dic
return return_list
[docs]def inflate_matrix(bootstrap_traits, bootstrap, P, zeros=True): r""" Project small matrix into large matrix using indeces provided: Arguments: bootstrap_traits (array-like): [`S` x `S`] covariance matrix estimates bootstrap (array-like): [`S` x 1] array with indices to project [`S` x `S`] matrix values into [`P` x `P`] matrix P (int): total number of dimensions zeros (bool): fill void spaces in large matrix with zeros (True, default) or nans (False) Returns: (numpy array): Returns [`P` x `P`] matrix containing [`S` x `S`] matrix values at bootstrap indeces and zeros/nans elswhere """ index = np.ix_(np.array(bootstrap), np.array(bootstrap)) if zeros is True: all_traits = np.zeros((P, P)) else: all_traits = nans((P, P)) all_traits[index] = bootstrap_traits
return (all_traits)
[docs]def verboseprint(message, verbose=True): r""" Print message if verbose option is True. Arguments: message (string): text to print verbose (bool): flag whether to print message (True) or not (False) """ if verbose is True:
print message
[docs]def match(samples_ref, data_compare, samples_compare, squarematrix=False): r""" Match the order of data and ID matrices to a reference sample order, Arguments: samples_ref (array-like): [`M`] sammple Ids used as reference data_compare (array-like): [`N` x `L`] data matrix with [`N`] samples and [`L`] columns samples_compare (array-like): [`N`] sample IDs to be matched to samples_ref squarematrix (bool): is data_compare a square matrix i.e. samples in cols and rows Returns: (tuple): tuple containing: - data_compare (numpy array): [`M` x `L`] data matrix of input data_compare - samples_compare (numpy array): [`M`] sample IDs of input samples_compare - samples_before (int): number of samples in data_compare/samples_compare before matching to samples_ref - samples_after (int): number of samples in data_compare/samples_compare after matching to samples_ref """ samples_before = samples_compare.shape[0] subset = pd.match(samples_ref, samples_compare) data_compare = data_compare[subset, :] if squarematrix: data_compare = data_compare[:, subset] samples_compare = samples_compare[subset] samples_after = samples_compare.shape[0] np.testing.assert_array_equal( samples_ref, samples_compare, err_msg=("Col order does not match. These" "are the differing columns:\t%s") % (np.array_str(np.setdiff1d(samples_ref, samples_compare))))
return (data_compare, samples_compare, samples_before, samples_after) def AlleleFrequencies(snp): hc_snps = np.array([makeHardCalledGenotypes(s) for s in snp]) counts = np.array(np.unique(hc_snps, return_counts=True)) frequencies = counts[1, :] / float(len(hc_snps)) major_a = sqrt(frequencies.max()) minor_a = 1 - major_a return minor_a, major_a def makeHardCalledGenotypes(snp): if snp <= 0.5: return 0 elif snp > 1.5: return 2 else: return 1 def effectiveTests(test): # 1. get correlation matrix corr_matrix = stats.spearmanr(test).correlation # 2. Get eigenvalues of correlation matrix: eigenval, eigenvec = la.eigh(corr_matrix) # 3. Determine effective number of tests: t = np.sqrt(eigenval).sum()**2 / eigenval.sum() return t