add tcga preprocessing

Alice LACAN
Commit 9b02626b4e3c0f065f87c27610e33bb8b6755941 9b02626b 1 parent 6937b224
Showing 5 changed files with 349 additions and 0 deletions
data/README.md
data/preprocessing.py
data/rtcga_rnaseq.R
data/tcga_dataloader.py
data/utils_preprocessing.py
--- a/data/README.md
View file @9b02626
+++ b/data/README.md
View file @9b02626
@@ -28,3 +28,93 @@ Arguments:
 
 # TCGA data
 
+ The gene expression data and clinical data is retrieved from the Cancer Genome Analysis dataset, using the `RTCGA` package.
+ To retrieve the raw data (release data of 2015-11-01), run the following script: 
+ 
+ >`rtcga_rnaseq.R`
+ 
+ All cohort files should be contained in the "tcga_files" folder. 
+ 
+ ## Preprocessing
+ To preprocess the data and generate the full csv file, run the following only once (in Python):
+ 
+ > `Processor = RNADataProcessor(_type_='data', cancer_type='all', dataset='tcga')`
+ > `Processor.preprocessing()`
+ 
+ This should be ran only once to generate the full expression dataset and the separate clinical data files.
+ Then, to build and load the full dataset with both gene expression and clinical data, run the following:
+ 
+ > `df = Loader.build_full_dataframe('/tcga_files')`
+ 
+ > `df.to_csv('/tcga_files/TCGA_rnaseq_RSEM_full_dataset.csv', index=False)`
+ 
+ Finally, to build and load the final preprocessed dataset with only the conditionning clinical variables of interest, run the following:
+ > `df_final = Loader.process_all_covariates('/tcga_files/TCGA_rnaseq_RSEM_full_dataset.csv')`
+ 
+ ## Train-test split
+ To split (80%-20%) the full preprocessed TCGA dataset (including both gene expression and clinical data), run the following:
+ 
+ > `from utils_preprocessing import train_test_split`
+ 
+ > `df_train, df_test = train_test_split(df_final, split_ratios=[0.8, 0.2], shuffle=True)`
+ 
+ > `df_train.to_csv('/tcga_files/train_df_covariates.csv', index=False)`
+ 
+ > `df_test.to_csv('/tcga_files/test_df_covariates.csv', index=False)`
+ 
+ ## Clinical data 
+ Different functions can be used to explore the data.
+ For instance: to take a look at the clinical data used in the conditionning of GANs, run the following:
+ > `df = Loader.fetch_covariates(cohort='BRCA')`
+ 
+ This will return a pandas dataframe with all the covariate variables: age, gender, cancer labels, tissue types, cancer types.
+ If `cohort='all'`, then the clinical variables for all cohorts will be returned in the dataframe.
+ 
+ Here are all the different cancer cohorts:
+ 
+ "ACC", "BLCA", "BRCA","CESC", "CHOL",
+ "COAD", "DLBC", "ESCA", "GBM", "HNSC",
+ "KICH", "KIRC", "KIRP", "LAML", "LGG",
+ "LIHC", "LUAD", "LUSC", "MESO", "OV",
+ "PAAD", "PCPG", "PRAD", "READ", "SARC",
+ "SKCM", "STAD", "STES", "TGCT", "THCA",
+ "THYM", "UCEC", "UCS", "UVM"
+ 
+ Here are all the cancer types:
+ 
+ 'adrenocortical carcinoma', 
+ 'bladder urothelial carcinoma',
+ 'breast invasive carcinoma', 
+ 'cervical squamous cell carcinoma endocervical adenocarcinoma', 
+ 'cholangiocarcinoma', 
+ 'colon adenocarcinoma', 
+ 'lymphoid neoplasm diffuse large b-cell lymphoma', 
+ 'esophageal carcinoma', 
+ 'glioblastoma multiforme', 
+ 'head neck squamous cell carcinoma',
+ 'kidney chromophobe', 
+ 'kidney renal clear cell carcinoma', 
+ 'kidney renal papillary cell carcinoma', 
+ 'acute myeloid leukemia', 
+ 'brain lower grade glioma', 
+ 'liver hepatocellular carcinoma', 
+ 'lung adenocarcinoma', 
+ 'lung squamous cell carcinoma',
+ 'mesothelioma', 
+ 'ovarian serous cystadenocarcinoma', 
+ 'pancreatic adenocarcinoma', 
+ 'pheochromocytoma and paraganglioma', 
+ 'prostate adenocarcinoma', 
+ 'rectum adenocarcinoma', 
+ 'sarcoma', 
+ 'skin cutaneous melanoma', 
+ 'stomach adenocarcinoma',
+ 'stomach esophageal carcinoma', 
+ 'testicular germ cell tumors', 
+ 'thyroid carcinoma', 
+ 'thymoma', 
+ 'uterine corpus endometrial carcinoma',
+ 'uterine carcinosarcoma', 
+ 'uveal melanoma'
+ 
+ There are **93.122 % of cancer samples** and **6.878 % of non cancerous samples** in the full dataset.
\ No newline at end of file
--- a/data/preprocessing.py 0 → 100644
View file @9b02626
+++ b/data/preprocessing.py 0 → 100644
View file @9b02626
+ """
+ Get clinical data or RNAseq from TCGA datasets obtained with RTCGA package on R.
+ 
+ """
+ 
+ # Imports
+ import os
+ import pandas as pd
+ import numpy as np
+ import warnings
+ warnings.filterwarnings('ignore') # Ignore warnings of pandas
+ from time import process_time
+ import random
+ random.seed(1)
+ from os.path import expanduser
+ USER_HOME = expanduser("~")
+ 
+ 
+ class RNADataProcessor():
+     def __init__(self, _type_='clinical', cancer_type='all', dataset='tcga'):
+         """ Init RNAseqDATA class with data type 'clinical' or 'data' for gene expressions and cancer type with 'all' by default
+         ----------
+         _type_ (str): data type either 'clinical' or RNAseq 'data'
+         cancer_type (str): 'all' cancer types is the only option for now
+         dataset (str): default is tcga, else 'microarrays'
+ 
+         """
+         assert _type_.lower() in ['clinical', 'data']
+         assert cancer_type.lower() in ['all']
+         assert dataset.lower() in ['tcga']
+ 
+ 
+         self.type = _type_
+         self.cancer_type = cancer_type
+         self.dataset= dataset.lower()
+ 
+         if dataset.lower() =='tcga':
+             if not os.path.exists('./tcga_files'):
+                 print("Loader could not find 'tcga_files' folder..")
+             else:
+                 print(f"Loader ready.")
+ 
+             self.PATH = './tcga_files'
+             self.PATHFOLDER = '/gdac.broadinstitute.org_{}.Mer'
+             self.PATHFILE = '/{}.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'
+             self.PATH_CLINICALFOLDER = '/clinical/gdac.broadinstitute.org_{}.Merge_Clinical.Level_1.2016012800.0.0'
+             self.PATH_CLINICALFILE = '/{}.clin.merged.txt'
+             self.COHORTS = ["ACC","BLCA","BRCA","CESC","CHOL","COAD","DLBC","ESCA","GBM", "HNSC","KICH","KIRC","KIRP",
+ "LAML","LGG","LIHC","LUAD","LUSC","MESO","OV","PAAD","PCPG","PRAD","READ","SARC","SKCM","STAD","STES","TGCT","THCA","THYM","UCEC","UCS","UVM"] #COADREAD, KIPAN and GBMLGG REMOVED
+ 
+     def preprocessing(self):
+         """Preprocessing according to dataset type
+         ---"""
+ 
+         if self.dataset=='tcga':
+             self.preprocessing_tcga()
+ 
+ 
+     @staticmethod
+     def standardize(x, mean=None, std=None):
+         """
+         Shape x: (nb_samples, nb_vars)
+         """
+         if mean is None:
+             mean = np.mean(x, axis=0)
+         if std is None:
+             std = np.std(x, axis=0)
+         return (x - mean) / std
+ 
+ 
+     def preprocessing_tcga(self):
+         """Preprocessing of data either clinical or rnaseq for TCGA data
+         ---------------------- """
+ 
+         """Loading individual  dataset"""
+         if self.type=='clinical':
+             #Function to read CSV
+             clin_UVM = pd.read_csv(self.PATH+self.PATH_CLINICALFOLDER.format('UVM')+self.PATH_CLINICALFILE.format('UVM'), sep='\t', header=None)
+ 
+         elif self.type=='data' and self.cancer_type =='all':
+ 
+             t1 = process_time()
+ 
+             for COHORT in self.COHORTS:
+ 
+                 #Load cohort dataframe
+                 try:
+                     df_temp = pd.read_csv(self.PATH+self.PATHFOLDER.format(COHORT)+self.PATHFILE.format(COHORT), sep='\s+')
+                 except FileNotFoundError:
+                     df_temp = pd.read_csv(self.PATH+self.PATHFOLDER.format(COHORT)+'g'+self.PATHFILE.format(COHORT), sep='\s+')
+ 
+                 print(f'[Callback]| Starting preprocessing for {COHORT}.')
+ 
+                 """Preprocessing of data"""
+                 PATIENTS_IDs = list(df_temp.columns[2:])
+                 #Get tuples of genes ids hybridization
+                 GENE_IDs = np.asarray([['','']]+[x.split('|') for x in list(df_temp['Hybridization'])[1:]]) #Same order check
+                 #New df
+                 df = df_temp[['REF']+PATIENTS_IDs]
+                 #Old columns misplaced
+                 OLD_COLS = list(df.columns)
+                 #Drop last column with NaN
+                 #print(OLD_COLS[-1]) #to drop
+                 df.drop(OLD_COLS[-1], inplace=True, axis=1)
+                 #Rename columns
+                 df.rename(columns={x:y for x,y in zip(OLD_COLS[:-1], PATIENTS_IDs )}, inplace=True)
+                 #Add columns with gene IDs
+                 df.insert(0, 'Hybridization' ,GENE_IDs[:,0])
+                 df.insert(1, 'Gene_id', GENE_IDs[:,1])
+                 #Drop string row of 'normalized count'
+                 df.drop(0,axis=0,inplace=True)
+ 
+                 # Save gene ids once
+                 if COHORT=='ACC':
+                     np.save(self.PATH+'/TCGA_rnaseq_RSEM_gene_ids_full.npy', GENE_IDs[:,1])
+                     np.save(self.PATH+'/TCGA_rnaseq_RSEM_hybridization_full.npy', GENE_IDs[:,0])
+ 
+                 #Get labels according to bar code: 4th element in barcode is a number+letter like '01A', if number <10 then cancer, else: not cancer
+                 LABELS = [1 if int(patient_id.split('-')[3][0:2]) < 10 else 0 for patient_id in PATIENTS_IDs]
+                 #print(LABELS)
+ 
+                 #Get data
+                 #convert to floats
+                 df[PATIENTS_IDs] =df[PATIENTS_IDs].apply(pd.to_numeric)
+                 SAMPLES = df[PATIENTS_IDs].to_numpy().transpose() #We transpose to have 1 row per patient
+ 
+                 #Remove nans
+                 SAMPLES[np.isnan(SAMPLES)] = 0.0
+ 
+                 #Keep only cancerous tissues for multiclass classification (patients with disease)
+                 DIS_SAMPLES = SAMPLES[np.asarray(LABELS)==1]
+ 
+ 
+                 if COHORT != 'ACC':
+                     #We don't keep cancer type for non cancerous tissues
+                     cancer_types_multiclass.extend(len(np.asarray(LABELS)[np.asarray(LABELS)==1])*[COHORT])
+                     cancer_types.extend(len(np.asarray(LABELS))*[COHORT])
+                     samples = np.append(samples, SAMPLES, axis=0)
+                     dis_samples = np.append(dis_samples, DIS_SAMPLES, axis=0)
+                     labels = np.append(labels, np.asarray(LABELS), axis=0)
+                     patients_ids = np.append(patients_ids, PATIENTS_IDs, axis=0)
+ 
+                 elif COHORT =='ACC':
+                     samples=SAMPLES
+                     dis_samples = DIS_SAMPLES
+                     labels = LABELS
+                     patients_ids = PATIENTS_IDs
+                     #Save cancer type for multiclass labels of cancerous tissues only
+                     cancer_types_multiclass = len(np.asarray(LABELS)[np.asarray(LABELS)==1])*[COHORT]
+                     cancer_types = len(np.asarray(LABELS))*[COHORT]
+ 
+ 
+             print(f"Preprocessing on TCGA data is complete.\n---> Saving {samples.shape[0]} RSEM RNAseq samples...")
+             #Save data in local folder
+             np.save(self.PATH+'/TCGA_rnaseq_RSEM_preprocess_full.npy', samples)
+             np.save(self.PATH+'/TCGA_rnaseq_RSEM_preprocess_multiclass_full.npy', dis_samples)
+             np.save(self.PATH+'/TCGA_rnaseq_RSEM_labels_full.npy', labels)
+             np.save(self.PATH+'/TCGA_rnaseq_RSEM_patients_ids_full.npy', patients_ids)
+             np.save(self.PATH+'/TCGA_rnaseq_RSEM_cancer_types_full.npy', cancer_types)
+             np.save(self.PATH+'/TCGA_rnaseq_RSEM_cancer_types_multiclass_full.npy', cancer_types_multiclass)
+ 
+             t_end = process_time()
+             print(f'[Callback]| Preprocessing took {round(t_end - t1, 5)} seconds to run.')
--- a/data/rtcga_rnaseq.R 0 → 100644
View file @9b02626
+++ b/data/rtcga_rnaseq.R 0 → 100644
View file @9b02626
+ ### Download rna seq data with RTCGA package
+ 
+ # Load package
+ library("RTCGA")
+ 
+ # Check HTML
+ # browseVignettes("RTCGA")
+ 
+ # Check available data times. According to docu, Version 20151101 of RTCA.seq contains RNAseq datasets released 2015-11-01.
+ checkTCGA("Dates")
+ 
+ # Check cohorts i.e cancer types
+ # This run needs the loading od package 'dplyr' first for function %>%.
+ (cohorts <- infoTCGA() %>%
+   rownames() %>%
+   sub("-counts", "", x=.))
+ 
+ 
+ # Downloading RNAseq files for all cohorts
+ 
+ # Create directory
+ dir.create("data_RNAseq_RTCGA")
+ 
+ releaseDate <- "2015-11-01"
+ sapply(cohorts, function(element){
+   tryCatch({
+     downloadTCGA(cancerTypes = element,
+                  dataSet = 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level',
+                  destDir = "data_RNAseq_RTCGA",
+                  date = releaseDate)},
+     error = function(cond){
+       cat("Error: Maybe there weren't rnaseq data for ", element, " cancer .\n")
+     }
+     )
+ })
+ 
+ 
+ # Shortening paths and directories
+ list.files("data_RNAseq_RTCGA") %>%
+   file.path("data_RNAseq_RTCGA", .) %>%
+   file.rename(to = substr(.,start=1, stop=50))
+ 
+ 
+ 
+ # Remove NA files
+ list.files("data_RNAseq_RTCGA") %>%
+   file.path("data_RNAseq_RTCGA", .) %>%
+   sapply(function(x){
+     if (x=="data_RNAseq_RTCGA/NA")
+       file.remove(x)
+   })
+ 
+ 
+ # Remove unneeded "MANIFEST.txt" file from each cohort folder
+ list.files("data_RNAseq_RTCGA") %>%
+   file.path("data_RNAseq_RTCGA", .) %>%
+   sapply(function(x){
+     file.path(x, list.files(x)) %>%
+       grep(pattern = 'MANIFEST.txt', x=., value=TRUE) %>%
+       file.remove()
+   })
+ 
+ 
+ # Assign paths to files downloaded
+ 
+ list.files("data_RNAseq_RTCGA") %>%
+   file.path("data_RNAseq_RTCGA", .) %>%
+   sapply(function(y){
+     file.path(y, list.files(y)) %>%
+       assign(value = .,
+              x = paste0(list.files(y) %>%
+                         gsub(x=.,
+                              pattern="\\..*",
+                              replacement="") %>%
+                           gsub(x=., pattern="-", replacement = "_"),
+                         ".rnaseq.path"),
+              envir=.GlobalEnv)
+   })
+ 
+ # Reading data with special function readTCGA to read and transpose data automatically
+ ls() %>%
+   grep("rnaseq\\.path", x=., value=TRUE) %>%
+   sapply(function(element){
+     tryCatch({
+       readTCGA(get(element, envir=.GlobalEnv),
+                dataType = "rnaseq") %>%
+         assign(value = .,
+                x=sub("\\.path", "", x=element),
+                envir=.GlobalEnv)
+     }, error = function(cond){
+       cat(element)
+     })
+     invisible(NULL)
+   }
+   )
+ 
--- a/data/tcga_dataloader.py 0 → 100644
View file @9b02626
+++ b/data/tcga_dataloader.py 0 → 100644
View file @9b02626
--- a/data/utils_preprocessing.py 0 → 100644
View file @9b02626
+++ b/data/utils_preprocessing.py 0 → 100644
View file @9b02626