Saturday, 24 September 2016

r - Adding grid lines to a ggplot independent of axis breaks



I have data on abundance over a sequence of nucleotides (ordered character sequence of A, T, C, and G) by two different categories. I want to use ggplot to show the coverage as a line plot over the sequence. I've gotten reasonably far and have just a few more hurdles to jump. Sample data:



> dput(so.sample)
structure(list(`1_mes_wt` = c(0, 0, 4.25897346349789, 10.0666645500859,
10.0666645500859, 48.3974257215669, 78.2102399660521, 250.698665237717,
250.698665237717, 268.702507606139, 271.79994285232, 301.225577691032,

301.225577691032, 301.225577691032, 301.225577691032, 311.292242241118,
311.292242241118, 311.292242241118, 311.292242241118, 311.292242241118,
311.292242241118, 311.292242241118, 311.292242241118, 311.292242241118,
311.292242241118, 311.292242241118, 311.292242241118, 321.746086196977,
321.746086196977, 321.746086196977, 319.616599465228, 308.969165806483,
285.931991163017, 204.817905653671, 195.332010212244, 170.165348837029,
170.165348837029, 170.165348837029, 170.165348837029, 37.3628126570497,
40.8474273090025, 22.456405534807, 22.456405534807, 17.8102526655366,
17.8102526655366, 17.8102526655366, 17.8102526655366, 17.8102526655366,
17.8102526655366, 17.8102526655366, 17.8102526655366, 15.6807659337877,

15.6807659337877, 15.6807659337877, 15.6807659337877, 15.6807659337877,
15.6807659337877, 15.6807659337877, 15.6807659337877, 15.6807659337877,
15.6807659337877, 15.6807659337877, 15.6807659337877, 15.6807659337877,
15.6807659337877, 15.6807659337877, 5.22692197792922, 5.22692197792922,
5.22692197792922, 5.22692197792922, 5.22692197792922, 5.22692197792922,
5.22692197792922, 5.22692197792922, 5.22692197792922, 5.22692197792922,
5.22692197792922, 1.74230732597641, 1.93589702886268, 1.93589702886268,
4.25897346349789, 2.71025584040775, 2.71025584040775, 68.5307548217387,
89.6320324363419, 90.0192118421144, 93.310236791181, 93.1166470882947,
93.5038264940673, 93.8910058998398, 93.8910058998398, 93.8910058998398,

93.8910058998398, 93.8910058998398, 93.8910058998398, 93.8910058998398,
93.8910058998398, 93.8910058998398, 93.8910058998398, 93.8910058998398,
93.8910058998398, 93.8910058998398, 93.8910058998398, 93.8910058998398,
93.8910058998398, 93.6974161969535, 93.6974161969535, 93.6974161969535,
93.6974161969535, 93.6974161969535, 93.6974161969535, 93.6974161969535,
93.6974161969535, 93.6974161969535, 89.2448530305694, 77.2422914516208,
1.16153821731761, 0, 0, 0, 0), `2_mes_wt` = c(0, 0, 13.3249362857652,
41.0267775114349, 41.3774337294814, 100.988990797378, 100.988990797378,
341.188500159198, 343.643093685523, 359.422623497613, 381.163309016493,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,

450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
450.242583971645, 450.242583971645, 450.242583971645, 450.242583971645,
426.047304926439, 366.085091640496, 308.928128098924, 201.977981594756,
201.977981594756, 205.133887557174, 205.133887557174, 205.133887557174,
85.5601172033343, 98.8850534890995, 17.1821546842762, 17.1821546842762,
17.1821546842762, 17.1821546842762, 17.1821546842762, 17.1821546842762,
16.8314984662297, 16.8314984662297, 16.8314984662297, 16.8314984662297,
50.8451516167356, 52.5984327069678, 52.5984327069678, 52.5984327069678,

52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 52.5984327069678, 52.5984327069678,
52.5984327069678, 52.5984327069678, 39.2734964212026, 37.5202153309704,
42.0787461655743, 54.7023700152465, 54.7023700152465, 54.7023700152465,
113.261958429004, 114.313927083143, 114.313927083143, 116.067208173376,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,

82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 82.0535550228698,
82.0535550228698, 82.0535550228698, 82.0535550228698, 74.3391182258479,
63.4687754664078, 0, 0, 0, 0, 0), nucl = 47738064:47738184, base = c("T",
"C", "A", "A", "A", "A", "A", "A", "G", "A", "C", "T", "A", "G",
"T", "C", "A", "A", "G", "T", "G", "C", "A", "G", "T", "A", "G",
"T", "G", "A", "G", "A", "A", "G", "G", "G", "G", "G", "G", "A",
"A", "A", "G", "T", "G", "T", "A", "G", "A", "A", "C", "A", "G",

"G", "A", "G", "T", "T", "C", "A", "A", "T", "C", "T", "G", "T",
"A", "A", "C", "T", "G", "A", "C", "T", "G", "T", "G", "A", "A",
"C", "A", "A", "T", "C", "A", "A", "T", "T", "G", "A", "G", "A",
"T", "A", "A", "C", "T", "C", "A", "C", "T", "A", "C", "C", "T",
"T", "C", "G", "G", "A", "C", "C", "A", "G", "C", "C", "A", "A",
"T", "G", "C")), .Names = c("1_mes_wt", "2_mes_wt", "nucl", "base"
), row.names = c(NA, -121L), class = "data.frame")


To show both the sequence (base) and the sequence positions (nucl) I plot the position as factors and then use a text geom to label the axis with the sequence. It works reasonably well:




# Render the coverage plot into a 15 x 7 PDF file.
pdf(file = "so_sample.pdf", width = 15, height = 7)
# Stack the sample columns into long form (nucl and base stay as id columns),
# draw one coloured line per sample, and print each base letter at y = -5 so
# the sequence reads like an extra x-axis annotation under the lines.
# NOTE(review): melt() is presumably reshape2::melt -- confirm which package
# is attached.
ggplot(
  melt(so.sample, id.vars = c("nucl", "base")),
  aes(x = factor(nucl), y = value, group = variable, colour = variable)
) +
  geom_line() +
  geom_text(aes(label = base), y = -5, size = 3)
# Close the PDF device so the file is flushed to disk.
dev.off()


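[Figure: line plot of both samples over the sequence, with the base letters printed beneath the lines and every nucleotide position labelled on the x-axis.]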




However, plotting these large numbers as factors causes major overlapping and I don't need to see each position label anyway. So I remove some of the breaks.



# Same plot as before, written to a second 15 x 7 PDF file.
pdf(file = "so_sample2.pdf", width = 15, height = 7)
# The positions are still plotted as a factor, but only every 10th position
# (counting from the smallest nucl value) is kept as an axis break, which
# thins out the overlapping tick labels.
ggplot(
  melt(so.sample, id.vars = c("nucl", "base")),
  aes(x = factor(nucl), y = value, group = variable, colour = variable)
) +
  geom_line() +
  geom_text(aes(label = base), y = -5, size = 3) +
  scale_x_discrete(
    breaks = seq(from = min(so.sample$nucl), to = max(so.sample$nucl), by = 10)
  )
# Close the PDF device so the file is flushed to disk.
dev.off()



[Figure: the same line plot with an x-axis label only at every 10th position; the per-position grid lines are no longer drawn.]



I am almost there, but in removing the breaks I have also removed the minor grid lines that are particularly useful alongside the character sequence. Is there any way of adding minor grid lines independent of the breaks specified in scale_x_discrete?



And one side question: The sequence labeling works because it is plotting the same characters in the exact same position for each grouping, and the final colour of these labels is determined by the order of plotting the groups. I would prefer the sequence to be a neutral colour, but I have no idea if it is even possible to give geom_text these values independent of the groupings.


Answer



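Leaving nucl as is, without wrapping it in a call to factor, allows you to use scale_x_continuous, which, unlike scale_x_discrete, has a minor_breaks argument. For example, scale_x_continuous(breaks = seq(min(so.sample$nucl), max(so.sample$nucl), 10), minor_breaks = so.sample$nucl) keeps a label at every 10th position while still drawing a minor grid line at every position.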


No comments:

Post a Comment

c++ - Does curly brackets matter for empty constructor?

Those brackets declare an empty, inline constructor. In that case, with them, the constructor does exist, it merely does nothing more than t...