Source code for LIMBR.old_fashioned

import numpy as np
import pandas as pd
import os
import time
import scipy.stats as stats
from numpy.linalg import svd, lstsq
from sklearn.decomposition import PCA
from scipy.stats import linregress, f_oneway
import itertools
import sys
from statsmodels.nonparametric.smoothers_lowess import lowess
from tqdm import tqdm
from sklearn.preprocessing import scale
from sklearn.neighbors import NearestNeighbors
import math
import json
from ctypes import c_int
import pickle
from multiprocess import Pool, current_process, Manager
from functools import partial
from sklearn import preprocessing

class old_fashioned:
    """
    Performs a standard normalization procedure without SVD as a baseline.

    This class performs simple quantile normalization and row scaling along
    with pool normalization for proteomics experiments using the same methods
    and interface employed in the sva class.  This provides a baseline
    comparison point for data processed with LIMBR.

    Parameters
    ----------
    filename : str
        Path to the input dataset (tab-separated text).
    data_type : str
        Type of dataset, one of 'p' or 'r'.  'p' indicates proteomic with two
        index columns specifying peptide and protein.  'r' indicates RNAseq
        with one index column indicating gene.
    pool : str
        Path to file containing pooled control design for experiment in the
        case of data_type = 'p'.  This should be a pickled dictionary with the
        keys being column headers corresponding to each sample and the values
        being the corresponding pooled control number.

    Attributes
    ----------
    raw_data : dataframe
        This is where the input data is stored.
    data_type : str
        This is where the data type ('p' or 'r') is stored.
    norm_map : dict
        This is where the assignment of pooled controls to samples are stored
        if data_type = 'p'.  Only set when `pool` is supplied.
    notdone : bool
        Set to True on construction; it is not read anywhere in this class.

    Raises
    ------
    ValueError
        If `data_type` is neither 'p' nor 'r'.
    """

    def __init__(self, filename, data_type, pool=None):
        """
        Imports data and initializes an old_fashioned object.

        Takes a file from one of two data types protein ('p') which has two
        index columns ('Peptide' and 'Protein') or rna ('r') which has only
        one ('#').  Opens a pickled file matching pooled controls to
        corresponding samples if `pool` is given.
        """
        # Seeds numpy's global random state, exactly as the original did.
        np.random.seed(4574)
        self.data_type = str(data_type)

        if self.data_type == 'p':
            index_cols = ['Peptide', 'Protein']
        elif self.data_type == 'r':
            index_cols = '#'
        else:
            # Fail early: without this the object would be created without
            # raw_data and only break later with an obscure AttributeError.
            raise ValueError(
                "data_type must be 'p' or 'r', got %r" % (self.data_type,)
            )
        self.raw_data = pd.read_csv(filename, sep='\t').set_index(index_cols)

        if pool is not None:
            # NOTE: unpickling can execute arbitrary code; only pass pool
            # files that come from a trusted source.
            with open(pool, 'rb') as handle:
                self.norm_map = pickle.load(handle)

        self.notdone = True

    @staticmethod
    def _pool_norm(df, dmap):
        """
        Pool normalizes samples in a proteomics experiment.

        Abundances of each sample are divided by the abundances of its
        corresponding pooled control column.

        Parameters
        ----------
        df : dataframe
            The dataframe to be pool normalized.  Columns whose name contains
            'pool' are treated as pooled controls.
        dmap : dict
            The dictionary connecting each sample column to its pooled
            control number; the control column is looked up as
            'pool_' followed by that number zero-padded to two digits.

        Returns
        -------
        newdf : dataframe
            Dataframe with samples pool normalized and pooled control columns
            dropped.
        """
        sample_cols = [col for col in df.columns.values if 'pool' not in col]
        # Build every column first and assemble the frame once, rather than
        # inserting columns one by one into a growing DataFrame.
        normalized = {
            col: df[col].div(df['pool_%02d' % dmap[col]], axis='index')
            for col in sample_cols
        }
        return pd.DataFrame(normalized, index=df.index)

    @staticmethod
    def _qnorm(df):
        """
        Quantile normalizes data by columns.

        A reference distribution is generated as the mean across rows of the
        dataset with all columns sorted by abundance.  Each column is then
        quantile normalized to this target distribution.

        Parameters
        ----------
        df : dataframe
            The dataframe to be quantile normalized.  It is not modified.

        Returns
        -------
        newdf : dataframe
            The quantile normalized dataframe, with rows sorted by index.
        """
        sorted_columns = [
            df[col].sort_values().reset_index(drop=True) for col in df
        ]
        ref = pd.concat(
            sorted_columns, axis=1, ignore_index=True
        ).mean(axis=1).values

        for col in df.columns:
            # sort_values returns a new frame, so the caller's data is never
            # touched; whole rows move together, so the sorted column can be
            # overwritten with the reference distribution in place.
            df = df.sort_values(col)
            df[col] = ref
        return df.sort_index()

    def pool_normalize(self):
        """
        Preprocessing normalization.

        For data_type 'r' the raw data is quantile normalized.  For any other
        data_type the raw data is first pool normalized using norm_map, rows
        containing infinite or missing values are dropped, columns are sorted
        by name and the result is quantile normalized.  In both cases each
        row is then standardized with a StandardScaler.

        Attributes
        ----------
        scaler : sklearn.preprocessing.StandardScaler()
            A fitted scaler from the sklearn preprocessing module.
        data_pnorm : dataframe
            Pool and quantile normalized data (not set for data_type 'r').
        data : dataframe
            The quantile normalized and row scaled data.
        """
        if self.data_type == 'r':
            normalized = self._qnorm(self.raw_data)
        else:
            pooled = self._pool_norm(self.raw_data, self.norm_map)
            # Division by a zero pooled control yields +/-inf; treat those
            # like missing values and drop the affected rows.
            pooled = pooled.replace([np.inf, -np.inf], np.nan).dropna()
            pooled = pooled.sort_index(axis=1)
            self.data_pnorm = self._qnorm(pooled)
            normalized = self.data_pnorm

        # The scaler standardizes columns, so the matrix is transposed to
        # make each row of the dataset one scaler feature, then flipped back.
        transposed = normalized.values.T
        self.scaler = preprocessing.StandardScaler().fit(transposed)
        self.data = pd.DataFrame(
            self.scaler.transform(transposed).T,
            columns=normalized.columns,
            index=normalized.index,
        )

    def normalize(self, outname):
        """
        Groups peptides by protein and outputs final processed dataset.

        For data_type 'p' the rows of `data` are averaged per 'Protein' index
        level.  The index is then named '#' and the result is written to a
        tab-separated output file.

        Parameters
        ----------
        outname : str
            Path to desired output file.
        """
        if self.data_type == 'p':
            self.data = self.data.groupby(level='Protein').mean()
        self.data.index.names = ['#']
        self.data.to_csv(outname, sep='\t')