I have data on abundance over a sequence of nucleotides (ordered character sequence of A, T, C, and G) by two different categories. I want to use ggplot to show the coverage as a line plot over the sequence. I've gotten reasonably far and have just a few more hurdles to jump. Sample data:
> dput(so.sample)
structure(list(`1_mes_wt` = c(0, 0, 4.25897346349789, 10.0666645500859,
10.0666645500859, 48.3974257215669, 78.2102399660521, 250.698665237717,
250.698665237717, 268.702507606139, 271.79994285232, 301.225577691032,
301.225577691032, 301.225577691032, 301.225577691032, 311.292242241118,
311.292242241118, 311.292242241118, 311.292242241118, 311.292242241118,
311.292242241118, 311.292242241118, 311.292242241118, 311.292242241118,
311.292242241118, 311.292242241118, 311.292242241118, 321.746086196977,
321.746086196977, 321.746086196977, 319.616599465228, 308.969165806483,
285.931991163017, 204.817905653671, 195.332010212244, 170.165348837029,
170.165348837029, 170.165348837029, 170.165348837029, 37.3628126570497,
40.8474273090025, 22.456405534807, 22.456405534807, 17.8102526655366,
17.8102526655366, 17.8102526655366, 17.8102526655366, 17.8102526655366,
17.8102526655366, 17.8102526655366, 17.8102526655366, 15.6807659337877,
15.6807659337877, 15.6807659337877, 15.6807659337877, 15.6807659337877,
15.6807659337877, 15.6807659337877, 15.6807659337877, 15.6807659337877,
15.6807659337877, 15.6807659337877, 15.6807659337877, 15.6807659337877,
15.6807659337877, 15.6807659337877, 5.22692197792922, 5.22692197792922,
5.22692197792922, 5.22692197792922, 5.22692197792922, 5.22692197792922,
5.22692197792922, 5.22692197792922, 5.22692197792922, 5.22692197792922,
5.22692197792922, 1.74230732597641, 1.93589702886268, 1.93589702886268,
4.25897346349789, 2.71025584040775, 2.71025584040775, 68.5307548217387,
89.6320324363419, 90.0192118421144, 93.310236791181, 93.1166470882947,
93.5038264940673, 93.8910058998398, 93.8910058998398, 93.8910058998398,
93.8910058998398, 93.8910058998398, 93.8910058998398, 93.8910058998398,
93.8910058998398, 93.8910058998398, 93.8910058998398, 93.8910058998398,
93.8910058998398, 93.8910058998398, 93.8910058998398, 93.8910058998398,
93.8910058998398, 93.6974161969535, 93.6974161969535, 93.6974161969535,
93.6974161969535, 93.6974161969535, 93.6974161969535, 93.6974161969535,
93.6974161969535, 93.6974161969535, 89.2448530305694, 77.2422914516208,
1.16153821731761, 0, 0, 0, 0), `2_mes_wt` = c(0, 0, 13.3249362857652,
41.0267775114349, 41.3774337294814, 100.988990797378, 100.988990797378,
341.188500159198, 343.643093685523, 359.422623497613, 381.163309016493,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
426.047304926439, 366.085091640496, 308.928128098924, 201.977981594756,
201.977981594756, 205.133887557174, 205.133887557174, 205.133887557174,
85.5601172033343, 98.8850534890995, 17.1821546842762, 17.1821546842762,
17.1821546842762, 17.1821546842762, 17.1821546842762, 17.1821546842762,
16.8314984662297, 16.8314984662297, 16.8314984662297, 16.8314984662297,
50.8451516167356, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 39.2734964212026, 37.5202153309704,
42.0787461655743, 54.7023700152465, 54.7023700152465, 54.7023700152465,
113.261958429004, 114.313927083143, 114.313927083143, 116.067208173376,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 74.3391182258479,
63.4687754664078, 0, 0, 0, 0, 0), nucl = 47738064:47738184, base = c("T",
"C", "A", "A", "A", "A", "A", "A", "G", "A", "C", "T", "A", "G",
"T", "C", "A", "A", "G", "T", "G", "C", "A", "G", "T", "A", "G",
"T", "G", "A", "G", "A", "A", "G", "G", "G", "G", "G", "G", "A",
"A", "A", "G", "T", "G", "T", "A", "G", "A", "A", "C", "A", "G",
"G", "A", "G", "T", "T", "C", "A", "A", "T", "C", "T", "G", "T",
"A", "A", "C", "T", "G", "A", "C", "T", "G", "T", "G", "A", "A",
"C", "A", "A", "T", "C", "A", "A", "T", "T", "G", "A", "G", "A",
"T", "A", "A", "C", "T", "C", "A", "C", "T", "A", "C", "C", "T",
"T", "C", "G", "G", "A", "C", "C", "A", "G", "C", "C", "A", "A",
"T", "G", "C")), .Names = c("1_mes_wt", "2_mes_wt", "nucl", "base"
), row.names = c(NA, -121L), class = "data.frame")
To show both the sequence (base) and the sequence positions (nucl) I plot the position as factors and then use a text geom to label the axis with the sequence. It works reasonably well:
pdf("so_sample.pdf",width=15,height=7)
ggplot(melt(so.sample,id.vars=c("nucl","base")), aes(factor(nucl),
value, group=variable, colour=variable)) +
geom_line() + geom_text(y=-5,size=3,aes(label=base))
dev.off()
However, plotting these large numbers as factors causes major overlapping and I don't need to see each position label anyway. So I remove some of the breaks.
pdf("so_sample2.pdf",width=15,height=7)
ggplot(melt(so.sample,id.vars=c("nucl","base")), aes(factor(nucl),
value, group=variable, colour=variable)) +
geom_line() + geom_text(y=-5,size=3,aes(label=base)) +
scale_x_discrete(breaks=seq(min(so.sample$nucl),
max(so.sample$nucl),10))
dev.off()
I am almost there, but in removing the breaks I have also removed the minor grid lines that are particularly useful alongside the character sequence. Is there any way of adding minor grid lines independent of the breaks specified in scale_x_discrete
?
And one side question: The sequence labeling works because it is plotting the same characters in the exact same position for each grouping, and the final colour of these labels is determined by the order of plotting the groups. I would prefer the sequence to be a neutral colour, but I have no idea if it is even possible to give geom_text
these values independent of the groupings.
Answer
Leaving nucl
as is without wrapping it in call to factor
allows you to use scale_x_continuous
, which unlike scale_x_discrete
, has a minor_breaks
argument.
No comments:
Post a Comment