bio-de-visualization

📁 gptomics/bioskills 📅 Jan 24, 2026
4
总安装量
4
周安装量
#53998
全站排名
安装命令
npx skills add https://github.com/gptomics/bioskills --skill bio-de-visualization

Agent 安装分布

trae 2
windsurf 1
opencode 1
codex 1
github-copilot 1
claude-code 1

Skill 文档

DE Visualization

Create visualizations for differential expression analysis using DESeq2 and edgeR built-in plotting functions.

Scope

This skill covers DE-specific built-in functions:

  • DESeq2: plotMA(), plotPCA(), plotDispEsts(), plotCounts()
  • edgeR: plotMD(), plotBCV(), plotMDS()
  • Sample distance heatmaps and p-value distributions

For custom ggplot2/matplotlib implementations of volcano, MA, and PCA plots, see data-visualization/specialized-omics-plots.

Required Libraries

library(DESeq2)
library(ggplot2)
library(pheatmap)
library(RColorBrewer)
library(ggrepel)  # For labeled points

Installation

install.packages(c('ggplot2', 'pheatmap', 'RColorBrewer', 'ggrepel'))
# Optional: Enhanced volcano plots
BiocManager::install('EnhancedVolcano')

MA Plot

DESeq2 MA Plot

# Built-in MA plot
plotMA(res, ylim = c(-5, 5), main = 'MA Plot')

# With custom alpha
plotMA(res, alpha = 0.05, ylim = c(-5, 5))

# Highlight specific genes
plotMA(res, ylim = c(-5, 5))
with(subset(res, padj < 0.01 & abs(log2FoldChange) > 2),
     points(baseMean, log2FoldChange, col = 'red', pch = 20))

Custom ggplot2 MA Plot

res_df <- as.data.frame(res)
res_df$significant <- res_df$padj < 0.05 & !is.na(res_df$padj)

ggplot(res_df, aes(x = log10(baseMean), y = log2FoldChange, color = significant)) +
    geom_point(alpha = 0.5, size = 1) +
    scale_color_manual(values = c('grey60', 'red')) +
    geom_hline(yintercept = 0, linetype = 'dashed') +
    labs(x = 'log10(Mean Expression)', y = 'log2 Fold Change', title = 'MA Plot') +
    theme_bw() +
    theme(legend.position = 'bottom')

edgeR MA Plot

# Using plotMD (mean-difference plot)
plotMD(qlf, main = 'MD Plot')
abline(h = c(-1, 1), col = 'blue', lty = 2)

Volcano Plot

Basic Volcano Plot

res_df <- as.data.frame(res)
res_df$significant <- res_df$padj < 0.05 & abs(res_df$log2FoldChange) > 1

ggplot(res_df, aes(x = log2FoldChange, y = -log10(pvalue), color = significant)) +
    geom_point(alpha = 0.5, size = 1) +
    scale_color_manual(values = c('grey60', 'red')) +
    geom_vline(xintercept = c(-1, 1), linetype = 'dashed', color = 'blue') +
    geom_hline(yintercept = -log10(0.05), linetype = 'dashed', color = 'blue') +
    labs(x = 'log2 Fold Change', y = '-log10(p-value)', title = 'Volcano Plot') +
    theme_bw()

Volcano with Gene Labels

res_df <- as.data.frame(res)
res_df$gene <- rownames(res_df)
res_df$significant <- res_df$padj < 0.05 & abs(res_df$log2FoldChange) > 1

# Label top genes
top_genes <- head(res_df[order(res_df$padj), ], 10)

ggplot(res_df, aes(x = log2FoldChange, y = -log10(pvalue))) +
    geom_point(aes(color = significant), alpha = 0.5, size = 1) +
    scale_color_manual(values = c('grey60', 'red')) +
    geom_text_repel(data = top_genes, aes(label = gene),
                    size = 3, max.overlaps = 20) +
    geom_vline(xintercept = c(-1, 1), linetype = 'dashed') +
    geom_hline(yintercept = -log10(0.05), linetype = 'dashed') +
    labs(x = 'log2 Fold Change', y = '-log10(p-value)') +
    theme_bw()

EnhancedVolcano

library(EnhancedVolcano)

EnhancedVolcano(res,
    lab = rownames(res),
    x = 'log2FoldChange',
    y = 'pvalue',
    pCutoff = 0.05,
    FCcutoff = 1,
    title = 'Differential Expression',
    subtitle = 'Treatment vs Control')

PCA Plot

DESeq2 PCA

# Variance stabilizing transformation first
vsd <- vst(dds, blind = FALSE)

# Basic PCA
plotPCA(vsd, intgroup = 'condition')

# With more options
plotPCA(vsd, intgroup = c('condition', 'batch'), ntop = 500)

Custom PCA with ggplot2

vsd <- vst(dds, blind = FALSE)
pca_data <- plotPCA(vsd, intgroup = c('condition', 'batch'), returnData = TRUE)
percentVar <- round(100 * attr(pca_data, 'percentVar'))

ggplot(pca_data, aes(x = PC1, y = PC2, color = condition, shape = batch)) +
    geom_point(size = 4) +
    xlab(paste0('PC1: ', percentVar[1], '% variance')) +
    ylab(paste0('PC2: ', percentVar[2], '% variance')) +
    ggtitle('PCA Plot') +
    theme_bw() +
    theme(legend.position = 'right')

edgeR PCA (via limma)

library(limma)
log_cpm <- cpm(y, log = TRUE)
plotMDS(log_cpm, col = as.numeric(group), pch = 16)
legend('topright', legend = levels(group), col = 1:nlevels(group), pch = 16)

Heatmaps

Top DE Genes Heatmap

library(pheatmap)

# Get top significant genes
sig_genes <- rownames(subset(res, padj < 0.01))

# Get normalized counts
vsd <- vst(dds, blind = FALSE)
mat <- assay(vsd)[sig_genes, ]

# Scale by row (z-score)
mat_scaled <- t(scale(t(mat)))

# Create annotation
annotation_col <- data.frame(
    condition = colData(dds)$condition,
    row.names = colnames(mat)
)

pheatmap(mat_scaled,
         annotation_col = annotation_col,
         show_rownames = FALSE,
         clustering_distance_rows = 'correlation',
         clustering_distance_cols = 'correlation',
         color = colorRampPalette(c('blue', 'white', 'red'))(100),
         main = 'Top DE Genes')

Sample Distance Heatmap

vsd <- vst(dds, blind = FALSE)

# Calculate sample distances
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)

# Annotation
annotation <- data.frame(
    condition = colData(dds)$condition,
    row.names = colnames(dds)
)

pheatmap(sampleDistMatrix,
         annotation_col = annotation,
         annotation_row = annotation,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         color = colorRampPalette(c('white', 'steelblue'))(100),
         main = 'Sample Distance Matrix')

Gene Expression Heatmap

# Select genes of interest
genes_of_interest <- c('gene1', 'gene2', 'gene3', 'gene4', 'gene5')
mat <- assay(vsd)[genes_of_interest, ]

pheatmap(mat,
         scale = 'row',
         annotation_col = annotation_col,
         show_rownames = TRUE,
         cluster_cols = TRUE,
         cluster_rows = TRUE,
         main = 'Genes of Interest')

Dispersion Plot

DESeq2

plotDispEsts(dds, main = 'Dispersion Estimates')

edgeR

plotBCV(y, main = 'Biological Coefficient of Variation')

Counts Plot for Individual Genes

DESeq2

# Plot counts for a specific gene
plotCounts(dds, gene = 'GENE_NAME', intgroup = 'condition')

# With ggplot2
d <- plotCounts(dds, gene = 'GENE_NAME', intgroup = 'condition', returnData = TRUE)
ggplot(d, aes(x = condition, y = count, color = condition)) +
    geom_point(position = position_jitter(width = 0.1), size = 3) +
    scale_y_log10() +
    ggtitle('GENE_NAME Expression') +
    theme_bw()

edgeR

# Get CPM for a gene
gene_idx <- which(rownames(y) == 'GENE_NAME')
cpm_gene <- cpm(y)[gene_idx, ]

# Plot
df <- data.frame(cpm = cpm_gene, group = group)
ggplot(df, aes(x = group, y = cpm, color = group)) +
    geom_point(position = position_jitter(width = 0.1), size = 3) +
    scale_y_log10() +
    labs(y = 'CPM', title = 'GENE_NAME Expression') +
    theme_bw()

P-value Histogram

# Check p-value distribution (should be uniform under null with peak near 0)
res_df <- as.data.frame(res)
ggplot(res_df, aes(x = pvalue)) +
    geom_histogram(bins = 50, fill = 'steelblue', color = 'white') +
    labs(x = 'P-value', y = 'Frequency', title = 'P-value Distribution') +
    theme_bw()

Saving Plots

# Save as PDF (vector)
pdf('volcano_plot.pdf', width = 8, height = 6)
# ... plot code ...
dev.off()

# Save as PNG (raster)
png('volcano_plot.png', width = 800, height = 600, res = 150)
# ... plot code ...
dev.off()

# Using ggsave for ggplot objects
p <- ggplot(...) + ...
ggsave('plot.pdf', p, width = 8, height = 6)
ggsave('plot.png', p, width = 8, height = 6, dpi = 300)

Color Palettes

# For heatmaps
library(RColorBrewer)

# Diverging (for expression: blue-white-red)
colorRampPalette(rev(brewer.pal(n = 7, name = 'RdBu')))(100)

# Sequential (for distances)
colorRampPalette(brewer.pal(n = 9, name = 'Blues'))(100)

# For categorical groups
brewer.pal(n = 8, name = 'Set1')

Quick Reference: Common Plots

Plot Purpose Function
MA plot LFC vs mean expression plotMA(), plotMD()
Volcano LFC vs significance ggplot2, EnhancedVolcano
PCA Sample clustering plotPCA(), plotMDS()
Heatmap Gene patterns pheatmap()
Dispersion Model fit plotDispEsts(), plotBCV()
Counts Individual genes plotCounts()

Related Skills

  • deseq2-basics – Generate DESeq2 results for visualization
  • edger-basics – Generate edgeR results for visualization
  • de-results – Filter genes before visualization
  • data-visualization/specialized-omics-plots – Custom ggplot2 volcano/MA/PCA functions
  • data-visualization/heatmaps-clustering – Advanced heatmap customization