Source code for pytransit.norm_tools

import sys
import numpy
import scipy.stats
import scipy.optimize
import warnings

class NormMethod:
    name = "undefined"

    @staticmethod
    def normalize():
        raise NotImplementedError

class NZMeanNorm(NormMethod):
    name = "nzmean"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Normalizes the data using the NZMean method.

        Arguments:
            data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.

        Returns:
            numpy array: Array with the normalized data.
            numpy array: Array with the normalization factors for the nzmean method.

        :Example:

            >>> import pytransit.norm_tools as norm_tools
            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> (normdata, factors) = norm_tools.NZMeanNorm.normalize(data)
            >>> print(factors)
            array([[ 1.14836149],
                   [ 0.88558737]])

        .. seealso:: :func:`normalize_data`
        """
        (K, N) = data.shape
        total_hits = numpy.sum(data, 1)
        TAs_hit = numpy.sum(data > 0, 1)
        mean_hits = total_hits / TAs_hit
        grand_total = numpy.sum(mean_hits)
        grand_mean = grand_total / float(K)
        factors = numpy.zeros((K, 1))
        factors[:, 0] = grand_mean / mean_hits
        data = factors * data
        return (data, factors)

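# Illustrative sketch (not part of the original module): the nzmean factors on a
# tiny toy array. Each dataset is scaled so that its mean over *non-zero* sites
# matches the grand mean of the per-dataset non-zero means.
#
#   >>> toy = numpy.array([[0., 4., 8.],
#   ...                    [0., 2., 2.]])
#   >>> (normtoy, factors) = NZMeanNorm.normalize(toy)
#   >>> factors.flatten()   # non-zero means are 6 and 2; grand mean is 4
#   # -> approximately [0.6667, 2.0]
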
class TotReadsNorm(NormMethod):
    name = "totreads"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Normalizes the data using the total reads method.

        Arguments:
            data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.

        Returns:
            numpy array: Array with the normalized data.
            numpy array: Array with the normalization factors for the totreads method.

        :Example:

            >>> import pytransit.norm_tools as norm_tools
            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> (normdata, factors) = norm_tools.TotReadsNorm.normalize(data)
            >>> print(factors)
            array([[ 1.2988762],
                   [ 0.8129396]])

        .. seealso:: :func:`normalize_data`
        """
        (K, N) = data.shape
        total_hits = numpy.sum(data, 1)
        TAs = float(N)
        mean_hits = total_hits / TAs
        grand_total = numpy.sum(mean_hits)
        grand_mean = grand_total / float(K)
        factors = numpy.zeros((K, 1))
        factors[:, 0] = grand_mean / mean_hits
        data = factors * data
        return (data, factors)

class TTRNorm(NormMethod):
    name = "TTR"

    # Note: the two estimators below are plain functions (not staticmethods) so
    # they can be used directly as default arguments to normalize().
    def empirical_theta(X):
        """Calculates the observed density of the data.

        This is used as an estimate of the insertion density by some
        normalization methods, and may be replaced by more sophisticated
        estimators later on.

        Arguments:
            X (numpy array): numpy array defining read-counts at N sites.

        Returns:
            float: Density of the given dataset.

        :Example:

            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> import pytransit.norm_tools as norm_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> theta = norm_tools.empirical_theta(data)
            >>> print(theta)
            0.467133570136

        .. seealso:: :meth:`TTRNorm.normalize`
        """
        return numpy.mean(X > 0)

    def trimmed_empirical_mu(X, t=0.05):
        """Estimates the trimmed mean of the data.

        This is used as an estimate of the mean count by some normalization
        methods, and may be replaced by more sophisticated estimators later on.

        Arguments:
            X (numpy array): numpy array defining read-counts at N sites.
            t (float): Fraction of the distribution to trim from each tail.

        Returns:
            float: (Trimmed) mean of the given dataset.

        :Example:

            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> import pytransit.norm_tools as norm_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> mu = norm_tools.trimmed_empirical_mu(data)
            >>> print(mu)
            120.73077107

        .. seealso:: :meth:`TTRNorm.normalize`
        """
        return scipy.stats.trim_mean(X[X > 0], t)

    @staticmethod
    def normalize(data, wigList=[], annotationPath="", thetaEst=empirical_theta, muEst=trimmed_empirical_mu, target=100.0):
        """Normalizes the data using the TTR method.

        Arguments:
            data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.
            thetaEst (function): Function used to estimate density. Should take a list of counts as input.
            muEst (function): Function used to estimate mean count. Should take a list of counts as input.
            target (float): Target for the product of density and mean count after normalization.

        Returns:
            numpy array: Array with the normalized data.
            numpy array: Array with the normalization factors for the TTR method.

        :Example:

            >>> import pytransit.norm_tools as norm_tools
            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> (normdata, factors) = norm_tools.TTRNorm.normalize(data)
            >>> print(factors)
            array([[ 1.        ],
                   [ 0.62862886]])

        .. seealso:: :func:`normalize_data`
        """
        K = len(data)
        N = len(data[0])
        factors = numpy.zeros((K, 1))
        for j in range(K):
            factors[j] = float(target) / (thetaEst(data[j]) * muEst(data[j]))
        data = factors * data
        return (data, factors)

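# Illustrative sketch (not part of the original module): TTR on a single toy
# dataset. With counts [0, 2, 4, 6], the density estimate is 3/4 and the trimmed
# mean of the non-zero counts is 4, so the factor is 100 / (0.75 * 4), i.e. each
# dataset is rescaled so that density * mean count hits the target.
#
#   >>> toy = numpy.array([[0., 2., 4., 6.]])
#   >>> (normtoy, factors) = TTRNorm.normalize(toy)
#   >>> factors.flatten()
#   # -> approximately [33.333]
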
class EmpHistNorm(NormMethod):
    name = "emphist"

    @staticmethod
    def Fzinfnb(params, args):
        """Objective function for the zero-inflated NB method."""
        pi, mu, r = params
        Fdata = args
        temp0 = numpy.nan_to_num(numpy.log(pi + scipy.stats.nbinom.pmf(Fdata[Fdata == 0], mu, r)))
        tempnz = numpy.nan_to_num(numpy.log(1.0 - pi) + scipy.stats.nbinom.logpmf(Fdata[Fdata > 0], mu, r))
        negLL = -(numpy.sum(temp0) + numpy.sum(tempnz))
        return negLL

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Returns the normalized data, using the empirical histogram method.

        Arguments:
            data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.
            wigList (list): List of paths to wig-formatted datasets.
            annotationPath (str): Path to annotation in .prot_table or GFF3 format.

        Returns:
            numpy array: Array with the normalized data.
            numpy array: Array with the normalization factors for the emphist method.

        :Example:

            >>> import pytransit.norm_tools as norm_tools
            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> (normdata, factors) = norm_tools.EmpHistNorm.normalize(data, ["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"], "transit/genomes/H37Rv.prot_table")
            >>> print(factors)
            array([[ 1.        ],
                   [ 0.63464722]])

        .. seealso:: :func:`normalize_data`
        """
        from pytransit import tnseq_tools
        G = tnseq_tools.Genes(wigList, annotationPath)
        K = len(wigList)

        # Collect total reads per gene for each dataset
        temp = []
        for j in range(K):
            reads_per_gene = []
            for gene in G:
                tempdata = numpy.array(gene.reads)
                if len(tempdata[0]) > 0:
                    reads_per_gene.append(numpy.sum(tempdata[j, :]))
            temp.append(reads_per_gene)
        temp = numpy.array(temp)

        # Scale each dataset by the peak of the log fold-change distribution
        # relative to the first dataset
        factors = numpy.ones((K, 1))
        for j in range(1, K):
            ii_good = numpy.logical_and(temp[0, :] > 0, temp[j, :] > 0)
            logFC = numpy.log(temp[j, ii_good] / temp[0, ii_good])
            mean = numpy.mean(logFC)
            std = numpy.sqrt(numpy.var(logFC))
            X = numpy.linspace(mean - (5 * std), mean + (std * 5), 50000)
            R = scipy.stats.gaussian_kde(logFC)
            Y = R(X)
            peakLogFC = X[Y.argmax()]
            if peakLogFC < 0:
                factors[j, 0] = numpy.exp(abs(peakLogFC))
            else:
                factors[j, 0] = 1.0 / numpy.exp(abs(peakLogFC))
        data = factors * data
        return (data, factors)

class AdaptiveBGCNorm(NormMethod):
    name = "aBGC"

    def ecdf(S, x):
        """Calculates an empirical CDF of the given data."""
        return numpy.sum(S <= x) / float(len(S))

    def cleaninfgeom(x, rho):
        """Returns a 'clean' output from the geometric distribution."""
        if x == float('inf'):
            return scipy.stats.geom.ppf(0.9999999999999999, rho)
        else:
            return x

    @staticmethod
    def normalize(data, wigList=[], annotationPath="", doTotReads=True, bgsamples=200000):
        """Returns the normalized data using the adaptive Beta-Geometric Correction (aBGC) method.

        Arguments:
            data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.
            doTotReads (bool): Boolean specifying whether to also perform TTR normalization.
            bgsamples (int): Integer specifying how many background samples to take.

        Returns:
            numpy array: Array with the normalized data.
            list: List of estimated (rho, Kp) parameter tuples.

        :Example:

            >>> import pytransit.norm_tools as norm_tools
            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> (normdata, params) = norm_tools.AdaptiveBGCNorm.normalize(data)
            >>> print(normdata)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

        .. seealso:: :func:`normalize_data`
        """
        K, N = data.shape
        norm_data = numpy.zeros(data.shape)
        S = bgsamples
        F = [i / 100.0 for i in range(0, 31) if i % 2 == 0]
        BGC = []
        param_list = []
        bgc_factors = []
        for j in range(K):
            nzdata = data[j][data[j] > 0]
            nzdata.sort()
            Nall = len(data[j])
            Nnz = len(nzdata)

            # Fit a Beta-Geometric background for each trimming fraction and
            # keep the fit with the best goodness-of-fit statistic
            GOF_list = []
            for frac in F:
                tQ = numpy.arange(0, Nnz) / float(Nnz)
                rho = 1.0 / (scipy.stats.trim_mean(nzdata, frac))
                rho_to_fit = rho
                try:
                    A = (numpy.sum(numpy.power(numpy.log(1.0 - tQ), 2))) / (numpy.sum(nzdata * numpy.log(1.0 - tQ)))
                    Kp = (2.0 * numpy.exp(A) - 1) / (numpy.exp(A) + rho - 1)
                    temp = scipy.stats.geom.rvs(scipy.stats.beta.rvs(Kp * rho, Kp * (1 - rho), size=S), size=S)
                    bgc_factors.append((rho, Kp))
                except Exception as e:
                    print("aBGC Error:", str(e))
                    print("rho=%s\tKp=%s\tA=%s" % (rho, Kp, A))
                    temp = scipy.stats.geom.rvs(0.01, size=S)
                corrected_nzdata = [cleaninfgeom(scipy.stats.geom.ppf(ecdf(temp, x), rho_to_fit), rho_to_fit) for x in nzdata]
                corrected_nzmean = numpy.mean(corrected_nzdata)
                Fp = scipy.stats.geom.ppf(numpy.arange(1, Nnz + 1) / float(Nnz), 1.0 / corrected_nzmean)
                ii_inf = Fp == float("inf")
                Fp[ii_inf] = max(Fp[~ii_inf]) + 100
                ch2_indiv = numpy.power(corrected_nzdata - Fp, 2) / Fp
                GOF = max(ch2_indiv)
                GOF_list.append((GOF, frac, rho_to_fit, Kp))
            gof, frac, best_rho, best_Kp = sorted(GOF_list)[0]
            BGsample = scipy.stats.geom.rvs(scipy.stats.beta.rvs(best_Kp * best_rho, best_Kp * (1 - best_rho), size=S), size=S)
            #BGC.append(dict([(x, removeinf(scipy.stats.geom.ppf(ecdf(temp, x), best_rho), best_rho)) for x in data[j]]))
            for i in range(N):
                norm_data[j, i] = cleaninfgeom(scipy.stats.geom.ppf(ecdf(BGsample, data[j, i]), best_rho), best_rho)
        if doTotReads:
            (norm_data, factors) = TTRNorm.normalize(norm_data)
        return (norm_data, bgc_factors)

class ZeroInflatedNBNorm(NormMethod):
    name = "zinfnb"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Normalizes the data using the zero-inflated negative binomial method.

        Arguments:
            data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.

        Returns:
            numpy array: Array with the normalized data.
            numpy array: Array with the normalization factors for the zinfnb method.

        :Example:

            >>> import pytransit.norm_tools as norm_tools
            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> factors = norm_tools.zinfnb_factors(data)
            >>> print(factors)
            [[ 0.0121883 ]
             [ 0.00747111]]

        .. seealso:: :func:`normalize_data`
        """
        N = len(data)
        G = len(data[0])
        factors = numpy.zeros((N, 1))
        for j in range(N):
            initParams = [0.3, 10, 0.5]
            M = "L-BFGS-B"
            Fdata = numpy.array(data[j])
            results = scipy.optimize.minimize(Fzinfnb, initParams, args=(Fdata,), method=M,
                                              bounds=[(0.0001, 0.9999), (0.0001, None), (0.0001, 0.9999)])
            pi, n, p = results.x
            mu = n * (1 - p) / p  # mean of the fitted negative binomial
            factors[j, 0] = 1.0 / mu
        data = factors * data
        return (data, factors)

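# Illustrative note (not part of the original module): scipy parameterizes the
# negative binomial by (n, p), with mean n*(1-p)/p, which is why the factor above
# is the reciprocal of the fitted mean. A quick sanity check:
#
#   >>> scipy.stats.nbinom.mean(10, 0.5)   # equals 10 * (1 - 0.5) / 0.5
#   10.0
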
class QuantileNorm(NormMethod):
    name = "quantile"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Performs quantile normalization as described by Bolstad et al. (2003).

        Arguments:
            data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.

        Returns:
            numpy array: Array with the data normalized by the quantile normalization method.

        :Example:

            >>> import pytransit.norm_tools as norm_tools
            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> (normdata, factors) = norm_tools.QuantileNorm.normalize(data)
            >>> print(normdata)

        .. seealso:: :func:`normalize_data`
        """
        N = len(data)
        G = len(data[0])

        # Sort each dataset's counts
        s_data = numpy.array([sorted(col) for col in data])

        # Get ranks of original data
        ranks = numpy.zeros(data.shape, dtype=int)
        for j in range(N):
            ranks[j, :] = scipy.stats.rankdata(data[j], method='dense')

        # Get empirical distribution
        ranked_means = numpy.mean(s_data, 0)

        # Create dictionary of rank to new empirical values
        rank2count = dict([(r, c) for (r, c) in zip(scipy.stats.rankdata(ranked_means, method='dense'), ranked_means)])

        # Assign values
        norm_data = numpy.zeros(data.shape)
        for i in range(G):
            norm_data[:, i] = [rank2count[ranks[j, i]] for j in range(N)]
        return (norm_data, numpy.ones(1))

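# Illustrative sketch (not part of the original module): quantile normalization
# on a 2x3 toy array. Each row is sorted, the sorted columns are averaged to get
# the reference distribution [1.5, 3.5, 5.5], and each original count is then
# replaced by the reference value at its rank within its own dataset.
#
#   >>> toy = numpy.array([[5., 2., 3.],
#   ...                    [4., 1., 6.]])
#   >>> (normtoy, factors) = QuantileNorm.normalize(toy)
#   >>> normtoy
#   # -> rows become [5.5, 1.5, 3.5] and [3.5, 1.5, 5.5]
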
class BetaGeomNorm(NormMethod):
    name = "betageom"

    def ecdf(S, x):
        """Calculates an empirical CDF of the given data."""
        return numpy.sum(S <= x) / float(len(S))

    def cleaninfgeom(x, rho):
        """Returns a 'clean' output from the geometric distribution."""
        if x == float('inf'):
            return scipy.stats.geom.ppf(0.9999999999999999, rho)
        else:
            return x

    @staticmethod
    def normalize(data, wigList=[], annotationPath="", doTTR=True, bgsamples=200000):
        """Returns normalized data according to the Beta-Geometric Correction (BGC) method.

        Arguments:
            data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.
            doTTR (bool): Boolean specifying whether to also perform TTR normalization.
            bgsamples (int): Integer specifying how many background samples to take.

        Returns:
            numpy array: Array with the data normalized using the betageom method.
            list: List of estimated (rho, Kp) parameter tuples.

        :Example:

            >>> import pytransit.norm_tools as norm_tools
            >>> import pytransit.tnseq_tools as tnseq_tools
            >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
            >>> print(data)
            array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                   [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
            >>> (normdata, params) = norm_tools.BetaGeomNorm.normalize(data)
            >>> print(normdata)
            [[ 0.  0.  0. ...,  0.  0.  0.]
             [ 0.  0.  0. ...,  0.  0.  0.]]

        .. seealso:: :func:`normalize_data`
        """
        (K, N) = data.shape
        total_hits = numpy.sum(data, 1)
        TAs_hit = numpy.sum(data > 0, 1)
        mean_hits = total_hits / TAs_hit
        grand_total = numpy.sum(mean_hits)
        grand_mean = grand_total / float(K)
        norm_data = numpy.zeros(data.shape)
        bgc_factors = []
        for j in range(K):
            tQ = numpy.arange(0, N) / float(N)
            eX = numpy.array([rd for rd in data[j]])
            eX.sort()
            rho = max(1.0 / scipy.stats.trim_mean(eX + 1, 0.001), 0.0001)
            A = (numpy.sum(numpy.power(numpy.log(1.0 - tQ), 2))) / (numpy.sum(eX * numpy.log(1.0 - tQ)))
            Kp = max((2.0 * numpy.exp(A) - 1) / (numpy.exp(A) + rho - 1), 10)
            bgc_factors.append((rho, Kp))
            try:
                BGsample = scipy.stats.geom.rvs(scipy.stats.beta.rvs(Kp * rho, Kp * (1 - rho), size=bgsamples), size=bgsamples)
            except Exception as e:
                print("BGC ERROR with rho=%f, Kp=%f, A=%s" % (rho, Kp, A))
                print(str(e))
                BGsample = scipy.stats.geom.rvs(rho, size=bgsamples)
            for i in range(N):
                norm_data[j, i] = cleaninfgeom(scipy.stats.geom.ppf(ecdf(BGsample, data[j, i]), 1.0 / grand_mean), 1.0 / grand_mean)
        if doTTR:
            (norm_data, factors) = TTRNorm.normalize(norm_data)
        return (norm_data, bgc_factors)

class NoNorm(NormMethod):
    name = "nonorm"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        return (data, numpy.ones(1))

methods = {}
methods["nonorm"] = NoNorm
methods["TTR"] = TTRNorm
methods["nzmean"] = NZMeanNorm
methods["totreads"] = TotReadsNorm
methods["betageom"] = BetaGeomNorm
methods["zinfnb"] = ZeroInflatedNBNorm
methods["quantile"] = QuantileNorm
methods["aBGC"] = AdaptiveBGCNorm
methods["emphist"] = EmpHistNorm

#########################
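# Illustrative sketch (not part of the original module): callers can dispatch on
# a user-supplied method name via the `methods` dict, falling back to NoNorm for
# unknown names (normalize_data below does this with a warning instead).
#
#   >>> norm_func = methods.get("TTR", NoNorm).normalize
#   >>> (normdata, factors) = norm_func(data)   # `data` as in the doctests above
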
def normalize_data(data, method="nonorm", wigList=[], annotationPath=""):
    """Normalizes the numpy array by the given normalization method.

    Arguments:
        data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.
        method (str): Name of the desired normalization method.
        wigList (list): List of paths for the desired wig-formatted datasets.
        annotationPath (str): Path to the prot_table annotation file.

    Returns:
        numpy array: Array with the normalized data.
        list: List containing the normalization factors. Empty if not used.

    :Example:

        >>> import pytransit.norm_tools as norm_tools
        >>> import pytransit.tnseq_tools as tnseq_tools
        >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
        >>> print(data)
        array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
        >>> (normdata, normfactors) = norm_tools.normalize_data(data, "TTR")  # Some methods require annotation and path to wig files.
        >>> print(normfactors)
        array([[ 1.        ],
               [ 0.62862886]])
        >>> print(normdata)
        array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

    .. note:: Some normalization methods require the wigList and annotationPath arguments.
    """
    if method in methods:
        return methods[method].normalize(data, wigList, annotationPath)
    else:
        warnstr = "Normalization method '%s' is unknown. Read-counts were not normalized." % (method)
        warnings.warn(warnstr)
        return methods["nonorm"].normalize(data, wigList, annotationPath)

def empirical_theta(X):
    """Calculates the observed density of the data.

    This is used as an estimate of the insertion density by some normalization
    methods, and may be replaced by more sophisticated estimators later on.

    Arguments:
        X (numpy array): numpy array defining read-counts at N sites.

    Returns:
        float: Density of the given dataset.

    :Example:

        >>> import pytransit.tnseq_tools as tnseq_tools
        >>> import pytransit.norm_tools as norm_tools
        >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
        >>> print(data)
        array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
        >>> theta = norm_tools.empirical_theta(data)
        >>> print(theta)
        0.467133570136

    .. seealso:: :meth:`TTRNorm.normalize`
    """
    return numpy.mean(X > 0)

def trimmed_empirical_mu(X, t=0.05):
    """Estimates the trimmed mean of the data.

    This is used as an estimate of the mean count by some normalization
    methods, and may be replaced by more sophisticated estimators later on.

    Arguments:
        X (numpy array): numpy array defining read-counts at N sites.
        t (float): Fraction of the distribution to trim from each tail.

    Returns:
        float: (Trimmed) mean of the given dataset.

    :Example:

        >>> import pytransit.tnseq_tools as tnseq_tools
        >>> import pytransit.norm_tools as norm_tools
        >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
        >>> print(data)
        array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
        >>> mu = norm_tools.trimmed_empirical_mu(data)
        >>> print(mu)
        120.73077107

    .. seealso:: :meth:`TTRNorm.normalize`
    """
    return scipy.stats.trim_mean(X[X > 0], t)

def Fzinfnb(params, args):
    """Objective function for the zero-inflated NB method."""
    pi, mu, r = params
    Fdata = args
    temp0 = numpy.nan_to_num(numpy.log(pi + scipy.stats.nbinom.pmf(Fdata[Fdata == 0], mu, r)))
    tempnz = numpy.nan_to_num(numpy.log(1.0 - pi) + scipy.stats.nbinom.logpmf(Fdata[Fdata > 0], mu, r))
    negLL = -(numpy.sum(temp0) + numpy.sum(tempnz))
    return negLL

def zinfnb_factors(data):
    """Returns the normalization factors for the data using the zero-inflated negative binomial method.

    Arguments:
        data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.

    Returns:
        numpy array: Array with the normalization factors for the zinfnb method.

    :Example:

        >>> import pytransit.norm_tools as norm_tools
        >>> import pytransit.tnseq_tools as tnseq_tools
        >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
        >>> print(data)
        array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
        >>> factors = norm_tools.zinfnb_factors(data)
        >>> print(factors)
        [[ 0.0121883 ]
         [ 0.00747111]]

    .. seealso:: :func:`normalize_data`
    """
    N = len(data)
    G = len(data[0])
    factors = numpy.zeros((N, 1))
    for j in range(N):
        initParams = [0.3, 10, 0.5]
        M = "L-BFGS-B"
        Fdata = numpy.array(data[j])
        results = scipy.optimize.minimize(Fzinfnb, initParams, args=(Fdata,), method=M,
                                          bounds=[(0.0001, 0.9999), (0.0001, None), (0.0001, 0.9999)])
        pi, n, p = results.x
        mu = n * (1 - p) / p
        factors[j, 0] = 1.0 / mu
    return numpy.array(factors)

#
def ecdf(S, x):
    """Calculates an empirical CDF of the given data."""
    return numpy.sum(S <= x) / float(len(S))

def cleaninfgeom(x, rho):
    """Returns a 'clean' output from the geometric distribution."""
    if x == float('inf'):
        return scipy.stats.geom.ppf(0.9999999999999999, rho)
    else:
        return x

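# Illustrative sketch (not part of the original module): `ecdf` returns the
# fraction of the sample at or below x, and `cleaninfgeom` caps the infinite
# quantile that geom.ppf produces when the CDF value reaches exactly 1.0.
#
#   >>> ecdf(numpy.array([1, 2, 3, 4]), 2)
#   0.5
#   >>> scipy.stats.geom.ppf(1.0, 0.5)                       # an exact 1.0 yields inf
#   inf
#   >>> cleaninfgeom(scipy.stats.geom.ppf(1.0, 0.5), 0.5)    # capped to a large finite quantile
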
#
def norm_to_target(data, target):
    """Returns factors to normalize the data to the given target value.

    Arguments:
        data (numpy array): (K,N) numpy array defining read-counts at N sites for K datasets.
        target (float): Floating point specifying the target for the mean of the data.

    Returns:
        numpy array: Array with the factors necessary to normalize the mean to the target.

    :Example:

        >>> import pytransit.norm_tools as norm_tools
        >>> import pytransit.tnseq_tools as tnseq_tools
        >>> (data, position) = tnseq_tools.get_data(["transit/data/glycerol_H37Rv_rep1.wig", "transit/data/glycerol_H37Rv_rep2.wig"])
        >>> print(data)
        array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
        >>> factors = norm_tools.norm_to_target(data, 100)
        >>> print(factors)
        [[ 1.8548104 ]
         [ 1.16088726]]

    .. seealso:: :func:`normalize_data`
    """
    (K, N) = data.shape
    factors = numpy.zeros((K, 1))
    factors[:, 0] = float(target) / numpy.mean(data, 1)
    return factors

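# Illustrative sketch (not part of the original module): norm_to_target on a toy
# array. The per-dataset means are 2 and 1, so scaling to a target mean of 100
# gives factors of 50 and 100.
#
#   >>> toy = numpy.array([[0., 2., 4.],
#   ...                    [1., 1., 1.]])
#   >>> norm_to_target(toy, 100).flatten()
#   # -> [50., 100.]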