import sys
import numpy
import scipy.stats
import scipy.optimize
import warnings
class NormMethod:
    """Base class for normalization methods.

    Subclasses set ``name`` to the key they are registered under in the
    module-level ``methods`` dict and override ``normalize()`` with a
    staticmethod of signature ``normalize(data, wigList=[], annotationPath="",
    ...)`` returning ``(normalized_data, factors)``.
    """
    name = "undefined"

    @staticmethod
    def normalize():
        # BUGFIX: was `raise NotImplemented`, which raises a TypeError because
        # NotImplemented is not an exception; NotImplementedError is intended.
        raise NotImplementedError
class NZMeanNorm(NormMethod):
    name = "nzmean"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Normalize datasets to a common mean over their non-zero sites.

        Each dataset is scaled so that its mean read-count over the sites
        with at least one read equals the grand mean of those per-dataset
        means.

        Arguments:
            data (numpy array): (K,N) array of read-counts at N sites for K datasets.
            wigList (list): Unused; kept for a uniform normalize() signature.
            annotationPath (str): Unused; kept for a uniform normalize() signature.

        Returns:
            tuple: (normalized (K,N) data array, (K,1) array of normalization factors).
        """
        num_datasets, _num_sites = data.shape
        reads_per_dataset = numpy.sum(data, 1)
        hit_sites = numpy.sum(data > 0, 1)
        # Mean read-count over non-zero ("hit") sites, per dataset.
        nz_means = reads_per_dataset / hit_sites
        grand_mean = numpy.sum(nz_means) / float(num_datasets)
        factors = numpy.zeros((num_datasets, 1))
        factors[:, 0] = grand_mean / nz_means
        return (factors * data, factors)
class TotReadsNorm(NormMethod):
    name = "totreads"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Normalize datasets to a common mean read-count over all sites.

        Each dataset is scaled so that its mean read-count (over all N
        sites, zero or not) equals the grand mean of the per-dataset means.

        Arguments:
            data (numpy array): (K,N) array of read-counts at N sites for K datasets.
            wigList (list): Unused; kept for a uniform normalize() signature.
            annotationPath (str): Unused; kept for a uniform normalize() signature.

        Returns:
            tuple: (normalized (K,N) data array, (K,1) array of normalization factors).
        """
        num_datasets, num_sites = data.shape
        reads_per_dataset = numpy.sum(data, 1)
        # Mean over every site, including zeros.
        site_means = reads_per_dataset / float(num_sites)
        grand_mean = numpy.sum(site_means) / float(num_datasets)
        factors = numpy.zeros((num_datasets, 1))
        factors[:, 0] = grand_mean / site_means
        return (factors * data, factors)
class TTRNorm(NormMethod):
    # BUGFIX: was "emphist" (copy-paste from EmpHistNorm, with which it
    # collided); every other method's `name` matches its key in the
    # module-level `methods` registry, which registers this class as "TTR".
    name = "TTR"

    # NOTE: deliberately NOT decorated @staticmethod so these plain functions
    # can be used directly as default argument values of normalize() below.
    def empirical_theta(X):
        """Return the observed insertion density of the data.

        Used as an estimate of insertion density by some normalization methods.

        Arguments:
            X (numpy array): (N,) array of read-counts at N sites.

        Returns:
            float: Fraction of sites with a read-count greater than zero.
        """
        return numpy.mean(X > 0)

    def trimmed_empirical_mu(X, t=0.05):
        """Return the trimmed mean of the non-zero read-counts.

        Used as an estimate of mean count by some normalization methods.

        Arguments:
            X (numpy array): (N,) array of read-counts at N sites.
            t (float): Fraction to trim from each end of the sorted counts.

        Returns:
            float: Trimmed mean of the non-zero counts.
        """
        return scipy.stats.trim_mean(X[X > 0], t)

    @staticmethod
    def normalize(data, wigList=[], annotationPath="", thetaEst=empirical_theta, muEst=trimmed_empirical_mu, target=100.0):
        """Total Trimmed Reads (TTR) normalization.

        Each dataset j is scaled by target / (theta_j * mu_j), where theta_j
        is its estimated insertion density and mu_j its estimated mean count,
        so that all datasets share the same expected density-weighted mean.

        Arguments:
            data (numpy array): (K,N) array of read-counts at N sites for K datasets.
            wigList (list): Unused; kept for a uniform normalize() signature.
            annotationPath (str): Unused; kept for a uniform normalize() signature.
            thetaEst (function): Density estimator; takes a 1-D array of counts.
            muEst (function): Mean-count estimator; takes a 1-D array of counts.
            target (float): Target value for theta * mu after scaling.

        Returns:
            tuple: (normalized (K,N) data array, (K,1) array of normalization factors).
        """
        K = len(data)
        factors = numpy.zeros((K, 1))
        for j in range(K):
            factors[j] = float(target) / (thetaEst(data[j]) * muEst(data[j]))
        data = factors * data
        return (data, factors)
class EmpHistNorm(NormMethod):
    name = "emphist"

    @staticmethod
    def Fzinfnb(params, args):
        """Negative log-likelihood of read-counts under a zero-inflated
        negative-binomial model with parameters (pi, mu, r)."""
        pi, mu, r = params
        counts = args
        zero_ll = numpy.nan_to_num(numpy.log(pi + scipy.stats.nbinom.pmf(counts[counts == 0], mu, r)))
        nonzero_ll = numpy.nan_to_num(numpy.log(1.0 - pi) + scipy.stats.nbinom.logpmf(counts[counts > 0], mu, r))
        return -(numpy.sum(zero_ll) + numpy.sum(nonzero_ll))

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Normalize using the empirical histogram of per-gene log fold-changes.

        Relative to the first dataset, a Gaussian KDE of the per-gene log
        fold-changes is evaluated on a grid and the peak location is scaled
        away, so that the most common gene-level fold-change becomes 1.

        Arguments:
            data (numpy array): (K,N) array of read-counts at N sites for K datasets.
            wigList (list): Paths to the K wig-formatted datasets.
            annotationPath (str): Path to annotation in .prot_table or GFF3 format.

        Returns:
            tuple: (normalized data array, (K,1) array of normalization factors).
        """
        from pytransit import tnseq_tools
        genes = tnseq_tools.Genes(wigList, annotationPath)
        num_datasets = len(wigList)
        per_gene_totals = []
        for j in range(num_datasets):
            totals = []
            for gene in genes:
                counts = numpy.array(gene.reads)
                # Skip genes with no TA sites.
                if len(counts[0]) > 0:
                    totals.append(numpy.sum(counts[j, :]))
            per_gene_totals.append(totals)
        per_gene_totals = numpy.array(per_gene_totals)
        factors = numpy.ones((num_datasets, 1))
        for j in range(1, num_datasets):
            # Only genes with reads in both the reference and this dataset.
            usable = numpy.logical_and(per_gene_totals[0, :] > 0, per_gene_totals[j, :] > 0)
            logFC = numpy.log(per_gene_totals[j, usable] / per_gene_totals[0, usable])
            center = numpy.mean(logFC)
            spread = numpy.sqrt(numpy.var(logFC))
            grid = numpy.linspace(center - (5 * spread), center + (spread * 5), 50000)
            density = scipy.stats.gaussian_kde(logFC)(grid)
            peakLogFC = grid[density.argmax()]
            if peakLogFC < 0:
                factors[j, 0] = numpy.exp(abs(peakLogFC))
            else:
                factors[j, 0] = 1.0 / numpy.exp(abs(peakLogFC))
        return (factors * data, factors)
class AdaptiveBGCNorm(NormMethod):
    name = "aBGC"

    @staticmethod
    def ecdf(S, x):
        """Empirical CDF of the sample S, evaluated at x."""
        return numpy.sum(S <= x) / float(len(S))

    @staticmethod
    def cleaninfgeom(x, rho):
        """Map an infinite geometric quantile to the largest finite one."""
        if x == float('inf'):
            return scipy.stats.geom.ppf(0.9999999999999999, rho)
        else:
            return x

    @staticmethod
    def normalize(data, wigList=[], annotationPath="", doTotReads=True, bgsamples=200000):
        """Adaptive Beta-Geometric Correction (aBGC) normalization.

        For each dataset, a Beta-Geometric background is fit to the non-zero
        counts over a grid of trimming fractions; the fit with the best
        chi-square goodness-of-fit score is then used to map every count
        through its empirical CDF onto a geometric distribution.

        Arguments:
            data (numpy array): (K,N) array of read-counts at N sites for K datasets.
            wigList (list): Unused; kept for a uniform normalize() signature.
            annotationPath (str): Unused; kept for a uniform normalize() signature.
            doTotReads (bool): Whether to finish with TTR normalization.
            bgsamples (int): Number of background samples to draw per fit.

        Returns:
            tuple: (normalized (K,N) data array, list of fitted (rho, Kp) pairs).
        """
        K, N = data.shape
        norm_data = numpy.zeros(data.shape)
        S = bgsamples
        # Candidate trimming fractions: 0.00, 0.02, ..., 0.30.
        F = [i / 100.0 for i in range(0, 31) if i % 2 == 0]
        bgc_factors = []
        for j in range(K):
            nzdata = data[j][data[j] > 0]
            nzdata.sort()
            Nnz = len(nzdata)
            # Quantile grid; invariant across trimming fractions, so hoisted.
            tQ = numpy.arange(0, Nnz) / float(Nnz)
            GOF_list = []
            # Defined up-front so the error report below cannot hit unbound
            # locals if the very first fit attempt raises.
            A = Kp = float("nan")
            for frac in F:
                rho = 1.0 / (scipy.stats.trim_mean(nzdata, frac))
                rho_to_fit = rho
                try:
                    A = (numpy.sum(numpy.power(numpy.log(1.0 - tQ), 2))) / (numpy.sum(nzdata * numpy.log(1.0 - tQ)))
                    Kp = (2.0 * numpy.exp(A) - 1) / (numpy.exp(A) + rho - 1)
                    temp = scipy.stats.geom.rvs(scipy.stats.beta.rvs(Kp * rho, Kp * (1 - rho), size=S), size=S)
                    bgc_factors.append((rho, Kp))
                except Exception as e:
                    # BUGFIX: was `except Except` (NameError at raise time) and
                    # a malformed format string ("%rho=s\t...").
                    print("aBGC Error:", str(e))
                    print("rho=%s\tKp=%s\tA=%s" % (rho, Kp, A))
                    # Fall back to a plain geometric background.
                    temp = scipy.stats.geom.rvs(0.01, size=S)
                # CONSISTENCY: call the class's own helpers; the unqualified
                # calls previously resolved to duplicate module-level copies,
                # leaving the class-level definitions dead.
                corrected_nzdata = [AdaptiveBGCNorm.cleaninfgeom(scipy.stats.geom.ppf(AdaptiveBGCNorm.ecdf(temp, x), rho_to_fit), rho_to_fit) for x in nzdata]
                corrected_nzmean = numpy.mean(corrected_nzdata)
                Fp = scipy.stats.geom.ppf(numpy.arange(1, Nnz + 1) / float(Nnz), 1.0 / corrected_nzmean)
                ii_inf = Fp == float("inf")
                Fp[ii_inf] = max(Fp[~ii_inf]) + 100
                # Chi-square goodness-of-fit of the corrected counts.
                ch2_indiv = numpy.power(corrected_nzdata - Fp, 2) / Fp
                GOF = max(ch2_indiv)
                GOF_list.append((GOF, frac, rho_to_fit, Kp))
            gof, frac, best_rho, best_Kp = sorted(GOF_list)[0]
            BGsample = scipy.stats.geom.rvs(scipy.stats.beta.rvs(best_Kp * best_rho, best_Kp * (1 - best_rho), size=S), size=S)
            for i in range(N):
                norm_data[j, i] = AdaptiveBGCNorm.cleaninfgeom(scipy.stats.geom.ppf(AdaptiveBGCNorm.ecdf(BGsample, data[j, i]), best_rho), best_rho)
        if doTotReads:
            (norm_data, factors) = TTRNorm.normalize(norm_data)
        return (norm_data, bgc_factors)
class ZeroInflatedNBNorm(NormMethod):
    # BUGFIX: was "zinfb"; the module-level `methods` registry stores this
    # class under "zinfnb", and every other method's `name` matches its key.
    name = "zinfnb"

    @staticmethod
    def Fzinfnb(params, args):
        """Negative log-likelihood of read-counts under a zero-inflated
        negative-binomial model.

        Arguments:
            params (tuple): (pi, mu, r) — pi is the zero-inflation weight,
                (mu, r) are passed to scipy.stats.nbinom as its (n, p).
            args (numpy array): 1-D array of read-counts.

        Returns:
            float: Negative log-likelihood.
        """
        pi, mu, r = params
        Fdata = args
        temp0 = numpy.nan_to_num(numpy.log(pi + scipy.stats.nbinom.pmf(Fdata[Fdata == 0], mu, r)))
        tempnz = numpy.nan_to_num(numpy.log(1.0 - pi) + scipy.stats.nbinom.logpmf(Fdata[Fdata > 0], mu, r))
        negLL = -(numpy.sum(temp0) + numpy.sum(tempnz))
        return negLL

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Normalize using a zero-inflated negative-binomial fit.

        Each dataset is scaled by the reciprocal of the mean count implied by
        its fitted zero-inflated negative-binomial parameters.

        Arguments:
            data (numpy array): (K,N) array of read-counts at N sites for K datasets.
            wigList (list): Unused; kept for a uniform normalize() signature.
            annotationPath (str): Unused; kept for a uniform normalize() signature.

        Returns:
            tuple: (normalized (K,N) data array, (K,1) array of normalization factors).
        """
        K = len(data)
        factors = numpy.zeros((K, 1))
        for j in range(K):
            initParams = [0.3, 10, 0.5]
            Fdata = numpy.array(data[j])
            # CONSISTENCY: call the class's own objective; the unqualified call
            # previously relied on an identical module-level Fzinfnb defined
            # far away in this file.
            results = scipy.optimize.minimize(ZeroInflatedNBNorm.Fzinfnb, initParams, args=(Fdata,), method="L-BFGS-B", bounds=[(0.0001, 0.9999), (0.0001, None), (0.0001, 0.9999)])
            pi, n, p = results.x
            # Mean of the fitted negative binomial.
            mu = n * (1 - p) / p
            factors[j, 0] = 1.0 / mu
        data = factors * data
        return (data, factors)
class QuantileNorm(NormMethod):
    name = "quantile"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Quantile normalization as described by Bolstad et al. 2003.

        Sites of equal rank within each dataset are mapped to the mean of
        the sorted counts across all datasets at that rank.

        Arguments:
            data (numpy array): (K,N) array of read-counts at N sites for K datasets.
            wigList (list): Unused; kept for a uniform normalize() signature.
            annotationPath (str): Unused; kept for a uniform normalize() signature.

        Returns:
            tuple: (quantile-normalized (K,N) data array, numpy.ones(1) placeholder).
        """
        num_datasets = len(data)
        num_sites = len(data[0])
        # Each dataset's counts in ascending order.
        sorted_counts = numpy.array([sorted(row) for row in data])
        # Dense ranks of the original counts, per dataset.
        ranks = numpy.zeros(data.shape, dtype=int)
        for row in range(num_datasets):
            ranks[row, :] = scipy.stats.rankdata(data[row], method='dense')
        # Empirical reference distribution: mean across datasets at each rank.
        ranked_means = numpy.mean(sorted_counts, 0)
        mean_ranks = scipy.stats.rankdata(ranked_means, method='dense')
        rank2count = dict(zip(mean_ranks, ranked_means))
        # Replace every count by the reference value of its rank.
        norm_data = numpy.zeros(data.shape)
        for site in range(num_sites):
            norm_data[:, site] = [rank2count[ranks[row, site]] for row in range(num_datasets)]
        return (norm_data, numpy.ones(1))
class BetaGeomNorm(NormMethod):
    name = "betageom"

    @staticmethod
    def ecdf(S, x):
        """Empirical CDF of the sample S, evaluated at x."""
        return numpy.sum(S <= x) / float(len(S))

    @staticmethod
    def cleaninfgeom(x, rho):
        """Map an infinite geometric quantile to the largest finite one."""
        if x == float('inf'):
            return scipy.stats.geom.ppf(0.9999999999999999, rho)
        else:
            return x

    @staticmethod
    def normalize(data, wigList=[], annotationPath="", doTTR=True, bgsamples=200000):
        """Beta-Geometric Correction (BGC) normalization.

        Fits a Beta-Geometric background to each dataset's counts and maps
        every count through its empirical CDF onto a geometric distribution
        whose mean is the grand mean of the non-zero site means.

        Arguments:
            data (numpy array): (K,N) array of read-counts at N sites for K datasets.
            wigList (list): Unused; kept for a uniform normalize() signature.
            annotationPath (str): Unused; kept for a uniform normalize() signature.
            doTTR (bool): Whether to finish with TTR normalization.
            bgsamples (int): Number of background samples to draw per dataset.

        Returns:
            tuple: (normalized (K,N) data array, list of fitted (rho, Kp) pairs).
        """
        (K, N) = data.shape
        total_hits = numpy.sum(data, 1)
        TAs_hit = numpy.sum(data > 0, 1)
        mean_hits = total_hits / TAs_hit
        grand_total = numpy.sum(mean_hits)
        grand_mean = grand_total / float(K)
        norm_data = numpy.zeros(data.shape)
        bgc_factors = []
        for j in range(K):
            tQ = numpy.arange(0, N) / float(N)
            # Sorted copy of this dataset's counts (original left untouched).
            eX = numpy.array(data[j])
            eX.sort()
            rho = max(1.0 / scipy.stats.trim_mean(eX + 1, 0.001), 0.0001)
            A = (numpy.sum(numpy.power(numpy.log(1.0 - tQ), 2))) / (numpy.sum(eX * numpy.log(1.0 - tQ)))
            Kp = max((2.0 * numpy.exp(A) - 1) / (numpy.exp(A) + rho - 1), 10)
            bgc_factors.append((rho, Kp))
            try:
                BGsample = scipy.stats.geom.rvs(scipy.stats.beta.rvs(Kp * rho, Kp * (1 - rho), size=bgsamples), size=bgsamples)
            except Exception as e:
                print("BGC ERROR with rho=%f, Kp=%f, A=%s" % (rho, Kp, A))
                print(str(e))
                BGsample = scipy.stats.geom.rvs(rho, size=bgsamples)
            # CONSISTENCY: call the class's own helpers; the unqualified calls
            # previously resolved to duplicate module-level copies, leaving the
            # class-level definitions dead.
            for i in range(N):
                norm_data[j, i] = BetaGeomNorm.cleaninfgeom(scipy.stats.geom.ppf(BetaGeomNorm.ecdf(BGsample, data[j, i]), 1.0 / grand_mean), 1.0 / grand_mean)
        if doTTR:
            (norm_data, factors) = TTRNorm.normalize(norm_data)
        return (norm_data, bgc_factors)
class NoNorm(NormMethod):
    name = "nonorm"

    @staticmethod
    def normalize(data, wigList=[], annotationPath=""):
        """Identity normalization: return data unchanged with a unit factor."""
        factors = numpy.ones(1)
        return (data, factors)
# Registry mapping normalization-method names to their implementing classes.
methods = {
    "nonorm": NoNorm,
    "TTR": TTRNorm,
    "nzmean": NZMeanNorm,
    "totreads": TotReadsNorm,
    "betageom": BetaGeomNorm,
    "zinfnb": ZeroInflatedNBNorm,
    "quantile": QuantileNorm,
    "aBGC": AdaptiveBGCNorm,
    "emphist": EmpHistNorm,
}
#########################
def normalize_data(data, method="nonorm", wigList=[], annotationPath=""):
    """Normalize a read-count array with the named normalization method.

    Arguments:
        data (numpy array): (K,N) array of read-counts at N sites for K datasets.
        method (str): Name of the desired normalization method (a key of the
            module-level ``methods`` registry).
        wigList (list): Paths to the wig-formatted datasets; required by some methods.
        annotationPath (str): Path to the prot_table annotation; required by some methods.

    Returns:
        tuple: (normalized data array, normalization factors — empty if unused).

    .. note:: Unknown method names fall back to "nonorm" with a warning.
    """
    if method not in methods:
        warnings.warn("Normalization method '%s' is unknown. Read-counts were not normalized." % (method))
        method = "nonorm"
    return methods[method].normalize(data, wigList, annotationPath)
def empirical_theta(X):
    """Return the observed insertion density of the data.

    Used as an estimate of insertion density by some normalization methods.
    May be improved by more sophisticated ways later on.

    Arguments:
        X (numpy array): (N,) array of read-counts at N sites.

    Returns:
        float: Fraction of sites with a read-count greater than zero.

    .. seealso:: :class:`TTR_factors`
    """
    is_hit = X > 0
    return numpy.mean(is_hit)
def trimmed_empirical_mu(X, t=0.05):
    """Return the trimmed mean of the non-zero read-counts.

    Used as an estimate of mean count by some normalization methods.
    May be improved by more sophisticated ways later on.

    Arguments:
        X (numpy array): (N,) array of read-counts at N sites.
        t (float): Fraction to trim from each end of the sorted non-zero counts.

    Returns:
        float: (Trimmed) mean of the non-zero counts.

    .. seealso:: :class:`TTR_factors`
    """
    nonzero_counts = X[X > 0]
    return scipy.stats.trim_mean(nonzero_counts, t)
def Fzinfnb(params, args):
    """Objective function for the zero-inflated NB method.

    Arguments:
        params (tuple): (pi, mu, r) — pi is the zero-inflation weight,
            (mu, r) are passed to scipy.stats.nbinom as its (n, p).
        args (numpy array): 1-D array of read-counts.

    Returns:
        float: Negative log-likelihood of the counts under the model.
    """
    pi, mu, r = params
    counts = args
    zeros = counts[counts == 0]
    nonzeros = counts[counts > 0]
    zero_ll = numpy.nan_to_num(numpy.log(pi + scipy.stats.nbinom.pmf(zeros, mu, r)))
    nonzero_ll = numpy.nan_to_num(numpy.log(1.0 - pi) + scipy.stats.nbinom.logpmf(nonzeros, mu, r))
    return -(numpy.sum(zero_ll) + numpy.sum(nonzero_ll))
def zinfnb_factors(data):
    """Return normalization factors from a zero-inflated negative-binomial fit.

    Each dataset's factor is the reciprocal of the mean count implied by its
    fitted zero-inflated negative-binomial parameters.

    Arguments:
        data (numpy array): (K,N) array of read-counts at N sites for K datasets.

    Returns:
        numpy array: (K,1) array of normalization factors for the zinfnb method.

    .. seealso:: :class:`normalize_data`
    """
    num_datasets = len(data)
    factors = numpy.zeros((num_datasets, 1))
    for j in range(num_datasets):
        counts = numpy.array(data[j])
        fit = scipy.optimize.minimize(Fzinfnb, [0.3, 10, 0.5], args=(counts,), method="L-BFGS-B", bounds=[(0.0001, 0.9999), (0.0001, None), (0.0001, 0.9999)])
        pi, n, p = fit.x
        # Mean of the fitted negative binomial.
        mean_count = n * (1 - p) / p
        factors[j, 0] = 1.0 / mean_count
    return numpy.array(factors)
#
def ecdf(S, x):
    """Calculates an empirical CDF of the given data.

    Returns the fraction of the sample S that is less than or equal to x.
    """
    num_below = numpy.sum(S <= x)
    return num_below / float(len(S))
def cleaninfgeom(x, rho):
    """Returns a 'clean' output from the geometric distribution.

    An infinite quantile is replaced by the largest finite geometric
    quantile for the given success probability rho.
    """
    if x != float('inf'):
        return x
    return scipy.stats.geom.ppf(0.9999999999999999, rho)
#
def norm_to_target(data, target):
    """Returns factors to normalize the data to the given target value.

    Arguments:
        data (numpy array): (K,N) array of read-counts at N sites for K datasets.
        target (float): Desired mean for each dataset after scaling.

    Returns:
        numpy array: (K,1) array of factors that scale each dataset's mean to target.

    .. seealso:: :class:`normalize_data`
    """
    num_datasets = data.shape[0]
    factors = numpy.zeros((num_datasets, 1))
    factors[:, 0] = float(target) / numpy.mean(data, 1)
    return factors