Transcriptome：差异表达分析

不同样本基因差异表达分析
利用Rstudio新建一个名为Differential_Expression_analysis的project，利用以下代码进行分析。仅需修改tpm_input的路径、样本名conditions0 。

xxxxxxxxxx
1
#清空当前环境中的所有变量
2
rm(list = ls())
3
{
4

5
  #如果没有安装"BiocManager"，则进行安装，否则跳过
6
{if (!requireNamespace("BiocManager", quietly = TRUE))
7
  install.packages("BiocManager")
8
}
9
#安装并加载R包"DESeq2"
10
{if (!requireNamespace("DESeq2", quietly = TRUE))
11
  BiocManager::install("DESeq2")
12
}
13

14
library(DESeq2)
15
#输入TPM数据
16
tpm_input <- read.table(file = "G:/R-Demo/counts_to_TPM/all_TPM.txt",
17
                  header = T,row.names = 1,sep = "\t")
18
#去除各转录组中tpm值均为0的基因
19
tpm <- tpm_input[rowSums(tpm_input) != 0,]
20
# 将TPM数据乘以1000并四舍五入，不保留小数
21
tpm <- round(tpm)
22
head(tpm)
23

24
# 创建条件信息:需要因子factor格式
25
#主要是转录组中包含重复，需要指定哪些重复是对应一个样本的，后续需要对不同样本进行比较
26
conditions0 <- c("HF1", "HF2", "HF3", "HGP", "HGX", "HL", "HM1", "HM2",
27
                        "HM3", "HP1", "HP2", "HP3", "HR", "HS", "HZZ")
28

29
# 嵌套循环比较两两条件
30
for (i in 1:(length(conditions0)-1)) {
31
  for (j in (i+1):length(conditions0)) {
32
    condition1 <- conditions0[i]
33
    condition2 <- conditions0[j]
34

35
    # 打印两个条件的比较结果
36
    print(paste("Comparing", condition1, "and", condition2))
37
    
38
    # 在这里执行比较的操作，根据具体需求进行相应的处理
39
#将需要比较的对象与数据中的各列一一对应
40
conditions <- as.factor(rep(c(condition1, condition2),each = 3))
41
#整理countData：将两者数据分别进行查找录入
42
{
43
  tpm_DES1 <- tpm[,grep(condition1, colnames(tpm_input))]
44
  #新增一列Geneid进行合并
45
  tpm_DES1$Geneid <- rownames(tpm_DES1)
46
  tpm_DES2 <- tpm[,grep(condition2, colnames(tpm_input))]
47
  tpm_DES2$Geneid <- rownames(tpm_DES2)
48
  #将两个数据框进行合并，以Geneid为参考
49
  tpm_DES <- merge(tpm_DES1,tpm_DES2, by = "Geneid")
50
  rownames(tpm_DES) <- tpm_DES[,1]
51
  #去除Geneid列
52
  tpm_DES <- tpm_DES[,-1]
53
  #将在各转录组中均为0的基因去掉
54
  tpm_DES <- tpm_DES[rowSums(tpm_DES) != 0,]
55
}
56
# 创建 DESeq2 的数据对象
57
#countData是表达量矩阵或者数据框
58
#colData是使用condition作为差异表达分析的条件变量。
59
#在这种设计下，DESeq2将计算每个基因在不同条件下的差异表达。
60
dds <- DESeqDataSetFromMatrix(countData = tpm_DES,
61
                              colData = data.frame(condition = conditions),
62
                              design = ~ condition)
63

64
#进行差异分析：这一步耗时较长，45个转录组的count文件花费大约2小时
65
dds <- DESeq(dds)
66
#获取差异表达基因的统计结果。
67
res <- results(dds, cooksCutoff = F, independentFiltering = F)
68

69
#查看前几行
70
head(res)
71
#将res转成dataframe格式
72
res_dataframe <- data.frame(res)
73
head(res_dataframe)
74
#如果没有安装dyplr，则安装dyplr，否则会直接跳过
75
{if (!requireNamespace("dplyr", quietly = TRUE))
76
  install.packages("dplyr")
77
}
78
#加载dplyr包
79
library(dplyr)
80
#在res_dataframe中新建group列
81
#根据log2FoldChange和padj条件进行写入“up”或者“down”，将结果赋予res_dataframe2
82
res_dataframe %>%
83
  mutate(group =case_when(
84
    log2FoldChange >= 2 & padj <= 0.05 ~ "UP",
85
    log2FoldChange <= -2 & padj <= 0.05 ~ "DOWN",
86
    TRUE ~ "NOT_Change"
87
)) -> res_dataframe2
88
#统计group列中不同数值的情况
89
table(res_dataframe2$group)
90
#输出路径定义：
91
File_output_path <- "./Differential_Expression_results/"
92
#输出文件名定义：
93
File_name <- paste0(condition1, "_vs_", condition2,".csv")
94

95
output_file <- paste0(File_output_path, File_name)
96
#导出.csv文件，quote=F：禁止在文本中添加“”
97
write.csv(res_dataframe2,file = output_file, quote = F)
98

99
#绘制火山图
100
#加载R包ggplot2
101
library(ggplot2)
102
#可选：筛选-log10(padj)>200 且<300的基因名
103
{
104
res_dataframe2$padj200 <- (-log10(res_dataframe2$padj))
105
#筛选上调UP基因
106
padj200_UP <- res_dataframe2[res_dataframe2$padj200 > 100
107
                             & res_dataframe2$group == "UP",]
108
#筛选下调UP基因
109
padj200_DOWN <- res_dataframe2[res_dataframe2$padj200 > 100 
110
                             & res_dataframe2$group == "DOWN",]
111
#要实现标签尽可能不重叠，可以利用ggrepel包进行修改
112
{if (!requireNamespace("ggrepel", quietly = TRUE))
113
  install.packages("ggrepel")
114
  library("ggrepel")
115
}
116

117
}
118

119
#设置数据来源，一般要求是dataframe格式，x轴是log2FoldChange，y轴是经过-log10()转化的padj
120
plot <- ggplot(res_dataframe2,aes(x=log2FoldChange,y=-log10(padj)))+
121
  #设置绘制类型为“散点”图，标签为group列的三个类别
122
  geom_point(aes(color = group))+
123
  #手动设置类别的填充颜色
124
  scale_color_manual(values = c("dodgerblue","gray","firebrick"))+
125
  #若要显示-log10(padj)>200 且 <300的基因名,则运行这一段代码。
126
  #max.overlaps：避免标签重叠；size：设置标签大小
127
  #上调基因标签
128
  geom_label_repel(data = padj200_UP, aes(x=log2FoldChange,y=-log10(padj)),
129
              label= rownames(padj200_UP), color= "lightpink",
130
              max.overlaps = 23, size =3)+
131
  #下调基因标签
132
  geom_label_repel(data = padj200_DOWN, aes(x=log2FoldChange,y=-log10(padj)),
133
                   label= rownames(padj200_DOWN), color= "skyblue",
134
                   max.overlaps = 23, size =3)+
135
  
136
  theme_bw()
137
plot_output_path <- "./Plots_output/"
138
plot_name <- paste0(condition1, "_vs_", condition2,".pdf")
139

140
output_plot <- paste0(plot_output_path,plot_name)
141
ggsave(filename = output_plot, plot = plot,width = 14, height = 14, units = "in", dpi = 600)
142
{
143
Finished_file <- length(list.files(File_output_path, pattern = "\\.csv$", full.names = TRUE))
144
Finished_plot <- length(list.files(plot_output_path, pattern = "\\.pdf$", full.names = TRUE))
145
# 比较完成后进行下一对条件的比较
146
print(paste0(condition1," vs ",condition2," Comparison complete"))
147
print(paste0(Finished_file," files had completed"))
148
print(paste0(Finished_plot," plots had completed"))
149
#打印已完成的比较组合
150
print(sub("\\.csv$","",list.files(File_output_path, pattern = "\\.csv$", full.names = F)))
151
}  
152
}
153
}
154
}
附表1：差异基因统计结果
差异表达基因的统计结果
附表2：统计group列中不同数值的情况，即上调、下调或没有差异表达
基因差异表达情况
不同样本基因差异表达分析

发表回复 取消回复

发表回复取消回复