Analysis Pipeline

Sequencing data processing and quality-control filtering

Paired-end sequenced reads were assembled using FLASH with default parameters. Only assembled reads with a length > 400 bases were retained for analysis. Sequencing adaptors were removed using cutadapt. Reads were trimmed using the FASTX-Toolkit. Finally, assembled reads with a quality score < 28 in more than 95% of the sequence were discarded.

Here are the scripts used for FLASH assembly.

rm(list=ls(all=TRUE))

setwd("/var/MICROBIOTE")

inputdir = "./01_reads.raw/"
ouputdir = "./02_reads.assembled/"
flashcmd = "./00_flash/FLASH-1.2.11/flash"

files = dir(inputdir, pattern="*.1.fastq")
files = unique(gsub("_1.fastq","",files))

for(file in files){

	file1 = basename(paste0(file,"_1.fastq"))
	file2 = basename(paste0(file,"_2.fastq"))
		
	cmd = paste0(flashcmd," ",inputdir,file1," ",inputdir,file2," --output-prefix=",file," --output-directory=",ouputdir)
	
	print(cmd)
	system(cmd)
	
}
Here are the scripts used for read labeling.

rm(list=ls(all=TRUE))

setwd("/var/MICROBIOTE")

inputdir  = "./02_reads.assembled/"
outputdir = "./03_reads.labeled/"

files     = dir(inputdir,pattern="*.extendedFrags.fastq",full.names=TRUE)

for(file in files){

	print(file)
	
	filename  <- basename(file)
	filename  <- gsub(".extendedFrags.fastq","",filename)
	
	seqtag    <- gsub("_","-",filename)
	
	lines     <- readLines(file)
	lines     <- gsub(" ","-",lines)
	lines     <- gsub("@M01626",paste0("@",seqtag,"_M01626"),lines)
	
	write(lines,paste0(outputdir,filename,".fastq"))
		
}
Here are the scripts used for assembled read filtering, sequencing adaptors removing, read trimming, and fasta conversion.

rm(list=ls(all=TRUE))

setwd("/var/MICROBIOTE")

extended_dir          = "./03_reads.labeled/"
length_filtered_dir   = "./04_reads.length_filtered/"
adapt_filtered_dir    = "./05_reads.adapt_filtered/"
trimmed_filtered_dir  = "./06_reads.trimmed_filtered/"
qual_filtered_dir     = "./07_reads.qual_filtered/"
fasta_filtered_dir    = "./08_reads.fasta/"

fastx_trimmer_cmd    = "./00_fastx/bin/fastx_trimmer"
fastq_quality_cmd    = "./00_fastx/bin/fastq_quality_filter"
fastq_to_fasta_cmd   = "./00_fastx/bin/fastq_to_fasta"

files                = dir(extended_dir,pattern="*.fastq")

min_length           = 400
f                    = 20
min_q                = 28
min_p                = 95

for(file in files){

	input_file  = file
	output_file = paste0(length_filtered_dir,gsub(".extendedFrags","",basename(input_file)))
	cmd         = paste0("cat ",extended_dir,input_file," | paste - - - - | awk 'length($2) >= ",min_length,"' | sed 's/\\t/\\n/g' > ",output_file)
	print(cmd)
	system(cmd)

	input_file  = output_file
	output_file = paste0(adapt_filtered_dir,basename(input_file))	
	cmd         = paste0("cutadapt -b TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCCTACGGGNGGCWGCAG -b GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAGGACTACHVGGGTATCTAATCC -o ",output_file, " ",input_file)
	print(cmd)
	system(cmd)

	input_file  = output_file
	output_file = paste0(trimmed_filtered_dir,basename(input_file))	
	cmd         = paste0(fastx_trimmer_cmd, " -Q33 -f ",f," -i " ,input_file," -o ",output_file)
	print(cmd)
	system(cmd)

	input_file  = output_file
	output_file = paste0(qual_filtered_dir,basename(input_file))
	cmd         = paste0(fastq_quality_cmd," -Q33 -q ",min_q," -p ",min_p," -i " ,input_file," -o ",output_file)
	print(cmd)
	system(cmd)
	
	input_file  = output_file
	output_file = paste0(fasta_filtered_dir,gsub("fastq","fasta",basename(input_file)))	
	cmd         = paste0(fastq_to_fasta_cmd," -Q33 -i " ,input_file," -o ",output_file)
	print(cmd)
	system(cmd)

}
Here are the scripts used for read pooling.

#!/bin/sh
cd /var/MICROBIOTE/

cat ./08_reads.fasta/REC*CBL015* > ./09_reads.pooled/REC-CBL015.fasta
cat ./08_reads.fasta/REC*CCA096* > ./09_reads.pooled/REC-CCA096.fasta
cat ./08_reads.fasta/REC*BA890I* > ./09_reads.pooled/REC-BA890I.fasta
cat ./08_reads.fasta/REC*CA086* > ./09_reads.pooled/REC-CA086.fasta
cat ./08_reads.fasta/REC*CB804C* > ./09_reads.pooled/REC-CB804C.fasta

cat ./08_reads.fasta/VAG*CBL015* > ./09_reads.pooled/VAG-CBL015.fasta
cat ./08_reads.fasta/VAG*CCA096* > ./09_reads.pooled/VAG-CCA096.fasta
cat ./08_reads.fasta/VAG*BA890I* > ./09_reads.pooled/VAG-BA890I.fasta
cat ./08_reads.fasta/VAG*CA086* > ./09_reads.pooled/VAG-CA086.fasta
cat ./08_reads.fasta/VAG*CB804C* > ./09_reads.pooled/VAG-CB804C.fasta

OTU identification, taxonomic assignment, and statistical analyses

Microbiota profiles were analyzed using QIIME. Rectal and vaginal microbiota analyses were performed separately and independently for each animal. OTU picking was performed using the uclust algorithm with identity set at 97%. Representative sets of sequences were aligned with the PyNAST algorithm. Taxonomic assignments were performed using the RDP classifier trained on the Greengenes database. Taxon with abundance < 1% in all animals were filtered out and aggregated to provide a complete view of the rectal and vaginal compartments. Alpha diversity was calculated based on the Simpson diversity index. Statistical analyses to identify associations between progesterone level and taxon relative abundances were performed using the MetagenomeSeq’s fitZIG algorithm. Species level assignments for Lactobacillus associated reads was performed using BLAST. Jensen-Shannon divergences were computed using the philentropy R package.

Here are the scripts used for OTU identification.

#!/bin/sh
conda activate qiime1
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/REC-BA890I.fasta -o /var/MICROBIOTE/10_otu/REC-BA890I/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/REC-CA086.fasta -o /var/MICROBIOTE/10_otu/REC-CA086/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/REC-CB804C.fasta -o /var/MICROBIOTE/10_otu/REC-CB804C/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/REC-CBL015.fasta -o /var/MICROBIOTE/10_otu/REC-CBL015/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/REC-CCA096.fasta -o /var/MICROBIOTE/10_otu/REC-CCA096/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/VAG-BA890I.fasta -o /var/MICROBIOTE/10_otu/VAG-BA890I/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/VAG-CA086.fasta -o /var/MICROBIOTE/10_otu/VAG-CA086/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/VAG-CB804C.fasta -o /var/MICROBIOTE/10_otu/VAG-CB804C/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/VAG-CBL015.fasta -o /var/MICROBIOTE/10_otu/VAG-CBL015/
pick_de_novo_otus.py -f -a --jobs_to_start 40 -i /var/MICROBIOTE/09_reads.pooled/VAG-CCA096.fasta -o /var/MICROBIOTE/10_otu/VAG-CCA096/

Here are the scripts used for OTU filtering.

#!/bin/sh
conda activate qiime1
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/REC-BA890I/otu_table.biom -o /var/MICROBIOTE/10_otu/REC-BA890I/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/REC-CA086/otu_table.biom -o /var/MICROBIOTE/10_otu/REC-CA086/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/REC-CB804C/otu_table.biom -o /var/MICROBIOTE/10_otu/REC-CB804C/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/REC-CBL015/otu_table.biom -o /var/MICROBIOTE/10_otu/REC-CBL015/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/REC-CCA096/otu_table.biom -o /var/MICROBIOTE/10_otu/REC-CCA096/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/VAG-BA890I/otu_table.biom -o /var/MICROBIOTE/10_otu/VAG-BA890I/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/VAG-CA086/otu_table.biom -o /var/MICROBIOTE/10_otu/VAG-CA086/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/VAG-CB804C/otu_table.biom -o /var/MICROBIOTE/10_otu/VAG-CB804C/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/VAG-CBL015/otu_table.biom -o /var/MICROBIOTE/10_otu/VAG-CBL015/otu_table_f.biom --min_count_fraction 0.01
filter_otus_from_otu_table.py -i /var/MICROBIOTE/10_otu/VAG-CCA096/otu_table.biom -o /var/MICROBIOTE/10_otu/VAG-CCA096/otu_table_f.biom --min_count_fraction 0.01
Here are the scripts used for extracting percentages.

rm(list=ls(all=TRUE))

setwd("/var/MICROBIOTE")

taxa_summary = "./11_taxa.summary/"
percentage_matrices = "./12_percentage.matrices/"
otu_file = "/otu_table_L6.txt"

files <- dir(taxa_summary,full.names = TRUE)
for(file in files){
	
	print(file)
	
	name <- basename(file)
	name <- gsub(otu_file,"",name)	
	
	file <- paste0(file,otu_file)
	data <- read.delim(file,skip=1,sep="\t",row.names=1,check.names = FALSE, strinGSAsFactors=FALSE)

	data <- data[apply(data,1,max)>0.01,]
	coeff <- apply(data,2,sum)
	for(i in 1:ncol(data)){
		data[,i] <- 1/coeff[i]*data[,i]
	}
	data <- data[,order(colnames(data))]
	
	rownames(data) <- gsub("k__Bacteria","",rownames(data))
	rownames(data) <- gsub("__","",rownames(data))

	write.table(data,paste0(percentage_matrices,name,".txt"),sep="\t",quote=FALSE)
	
}

Here are the scripts used for extracting filtered BIOM files.

rm(list=ls(all=TRUE))

options(strinGSAsFactors=FALSE)

setwd("/var/MICROBIOTE")

taxa_summary = "./10_otu/"
percentage_matrices = "./13_filtered.biom/"

files <- dir(taxa_summary,full.names = TRUE)
for(file in files){
	
	print(file)
	
	file.copy(paste0(taxa_summary,basename(file),"/otu_table_f.biom"),paste0(percentage_matrices,basename(file),".biom"))
		
}
Here are the scripts used for computating alpha diversities.

#!/bin/sh
cd /var/MICROBIOTE/
conda activate qiime1
alpha_diversity.py -i ./13_filtered.biom/ -o ./14_alpha.diversities/ -m simpson
Here are the scripts used for extracting differential analyses.

#!/bin/sh
cd /var/MICROBIOTE/
conda activate qiime1
differential_abundance.py -i /var/MICROBIOTE/13_filtered.biom/ -o /var/MICROBIOTE/14_differential.analyses -m map.txt -a metagenomeSeq_fitZIG -c Progesterone -x secretory -y proliferative
Here are the scripts used for extracting Bifidobacteriaceae associated with animal CB804C in the vaginal compartiment.

rm(list=ls(all=TRUE))

setwd("/var/MICROBIOTE")

assignments  <- read.delim("./VAG-CB804C_rep_set_tax_assignments.txt",header=FALSE,stringsAsFactors=FALSE)
assignments <- assignments[grep("f__Bifidobacteriaceae",assignments$V2),]
str(assignments)

reads        <- read.delim("./VAG-CB804C_rep_set_aligned.fasta",header=FALSE,stringsAsFactors=FALSE)
id           <- reads[seq(1,nrow(reads),2),]
seq          <- reads[seq(2,nrow(reads),2),]

res          <- data.frame(id,seq)
res$seq      <- gsub("-","",res$seq)
res$denovo   <- as.vector(sapply(res$id,function(x) paste(strsplit(x," ")[[1]][c(1)],collapse=" ")))
res$denovo   <- gsub(">","",res$denovo)
str(res)

res          <- res[res$denovo %in% assignments$V1,]
res$id       <- NULL
res          <- res[,c("denovo","seq")]
res$denovo   <- paste0(">",res$denovo,"\n")
write.table(res,"./VAG-CB804C-Bifidobacteriaceae.fasta",sep="",quote=FALSE,row.names=FALSE,col.names=FALSE)