I want to plot interaction effects on my data set. I cannot share the full data set due to confidentiality reasons, but I have added the output of the effect()
function, which can be substituted into the plot functions to reproduce my results
The interaction effect is between two continuous variables, varP
and varL
. The model is a binomial probit glm. I am using the effects()
package.
Calling base R plot()
yields the following chart:
plot(effect("varL:varP", hx.x), multiline = TRUE)
Calling ggplot
yields an error, because ggplot2
cannot deal with continuous variables (Error: A continuous variable can not be mapped to linetype
). So I decided to use cut()
to transform the continuous into a categorical variable and redo the regression. Calling ggplot
now yields the following plot:
ggplot(data.frame(effect("varL:varP_range", hx.x1))) +
geom_line(aes(varL,fit,linetype=varP_range)) +
theme_bw() +
geom_point(aes(varL,fit, shape = varP_range), size = 4)
The effect()
call results in the following data frame:
structure(list(varL = c(0, 4900000, 9800000, 1.5e+07, 2e+07,
0, 4900000, 9800000, 1.5e+07, 2e+07, 0, 4900000, 9800000, 1.5e+07,
2e+07, 0, 4900000, 9800000, 1.5e+07, 2e+07, 0, 4900000, 9800000,
1.5e+07, 2e+07, 0, 4900000, 9800000, 1.5e+07, 2e+07, 0, 4900000,
9800000, 1.5e+07, 2e+07, 0, 4900000, 9800000, 1.5e+07, 2e+07),
varP_range = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L), .Label = c("(0,0.1]", "(0.1,0.2]", "(0.2,0.3]",
"(0.3,0.4]", "(0.4,0.5]", "(0.5,0.6]", "(0.6,0.7]", "(0.7,0.8]",
NA), class = "factor"), fit = c(0.0493753018091432, 0.0674980435061065,
0.0903776022141578, 0.120475525568566, 0.155489425305936,
0.0137978311572348, 0.0146948327184384, 0.0156415523658014,
0.0167030042368772, 0.0177811545669692, 0.0283241226002015,
0.0320688611660079, 0.0362132772188478, 0.0410808356583781,
0.0462488565292175, 0.0376893470992477, 0.0434689290038736,
0.0499436899670642, 0.0576329205782917, 0.0658761148111993,
0.0461027343009466, 0.0491516310196363, 0.0523594626652533,
0.0559432088894986, 0.0595689358953948, 0.0271243286792884,
0.0288090021916312, 0.0305797680715237, 0.0325565879397662,
0.0345556259933517, 0.0585874566843392, 0.0695486360371181,
0.0820248510196267, 0.0970343941431089, 0.113280038210583,
0.0209075509863267, 0.0315251673839061, 0.046253681947139,
0.0674584535698224, 0.0942769896468577), se = c(0.026293040668674,
0.0305781859107515, 0.0540612604209606, 0.0833019722980397,
0.112487542810938, 0.0338534463222995, 0.0398771851800222,
0.0731653210576088, 0.113913900015802, 0.15437353282528,
0.0409108254231718, 0.0511475671899819, 0.0972355260310069,
0.152187250282481, 0.206403037601197, 0.0130040830190118,
0.0163131582968098, 0.03036212612247, 0.0472104967827754,
0.0638665398985353, 0.00832820196917705, 0.0121268883535081,
0.0232740805148284, 0.0361535949146859, 0.0487814483071773,
0.0113362224665359, 0.0147098330114556, 0.027542020584634,
0.0427882020481342, 0.057830481511906, 0.01601220706873,
0.0215340145342622, 0.040911260287911, 0.0636748713064177,
0.0860732173940343, 0.0419361989774607, 0.0474273262485864,
0.085507397635156, 0.132979171779259, 0.180309829530109),
lower = c(0.0443332267166309, 0.0600183728153565, 0.0743144400165266,
0.0907939516369112, 0.108668209187463, 0.0116238722892657,
0.012023490450893, 0.0108135425488573, 0.00937570172867644,
0.00809811278621186, 0.0235056239743921, 0.025500658500369,
0.0234608868503482, 0.0208470526400479, 0.0184481609874694,
0.0356438216686479, 0.0406013691901053, 0.0441057429680943,
0.0477084891588961, 0.0512971927819841, 0.0445466712620454,
0.0467806376034436, 0.0476567944606461, 0.048393808464277,
0.0490625016612481, 0.0257635103540086, 0.0269634191042765,
0.0270321095726769, 0.0269061995593561, 0.0267392100653416,
0.0550074959164803, 0.0640838876027606, 0.0705422656571929,
0.0773047979652944, 0.0841065921246788, 0.017106637503393,
0.025481438787035, 0.0321633667203674, 0.0395762352261805,
0.0476311065775831), upper = c(0.0548650233126245, 0.0756784493915628,
0.108885802402636, 0.156404820719125, 0.213976690522108,
0.0163138248731072, 0.0178616891841985, 0.0222128708020368,
0.0284614557616305, 0.0359832219741959, 0.0339378335208811,
0.039975201853988, 0.0541535055907615, 0.0749340625856128,
0.100655675600276, 0.0398297048294564, 0.0464977933470156,
0.0563819985882714, 0.0691131684577491, 0.0834778881744663,
0.0477021593328273, 0.0516176358055639, 0.0574233019439704,
0.0643924091986213, 0.0717620710757599, 0.0285446055381808,
0.0307584171780848, 0.0345046440925246, 0.0391516285886827,
0.0441582201173176, 0.062347826961093, 0.0753653653841884,
0.0948621519669373, 0.120230399782042, 0.149038245196167,
0.0254001895885684, 0.0387079476679479, 0.0649209421145592,
0.108535107340218, 0.16815843130232)), .Names = c("varL",
"varP_range", "fit", "se", "lower", "upper"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40"), class = "data.frame")
My questions are:
1) How does effect()
from the effects
package transform the continuous into a categorical variable? How would this be replicated with ggplot()
?
2) What is the reason that the lines in the base R plot all intersect at the same coordinate?
Answer
To answer your first question: To define the factors, Effects
uses
nice(seq(min(var), max(var), length.out=5))
where nice is defined like so (from here):
nice <- function (x, direction = c("round", "down", "up"), lead.digits = 1) {
direction <- match.arg(direction)
if (length(x) > 1){
result <- sapply(x, nice, direction = direction, lead.digits = lead.digits)
if (anyDuplicated(result)) result <- nice(x, direction=direction, lead.digits = lead.digits + 1)
return(result)
}
if (x == 0)
return(0)
power.10 <- floor(log(abs(x), 10))
if (lead.digits > 1)
power.10 <- power.10 - lead.digits + 1
lead.digit <- switch(direction, round = round(abs(x)/10^power.10),
down = floor(abs(x)/10^power.10), up = ceiling(abs(x)/10^power.10))
sign(x) * lead.digit * 10^power.10
}
An example of using it:
library(effects)
set.seed(123)
x = rnorm(100)
z = rexp(100)
y = factor(sample(1:2, 100, replace=T))
test = glm(y~x+z+x*z, family = binomial(link = "probit"))
preddat <- matrix('', 25, 100)
preddat <- expand.grid(nice(seq(min(x), max(x), length.out=5)), nice(seq(min(z), max(z), length.out=5)))
colnames(preddat) <- c("x", "z")
predicts <- predict(test, preddat, type = "response")
dim(predicts) <- c(5,5)
effectspred <- pnorm(effect("x:z", test)$fit)
dim(effectspred) <- c(5,5)
all.equal(effectspred, predicts)
[1] TRUE
This is straight forward to use with ggplot:
library(tidyverse)
predicts %>% as.data.frame() %>%
gather() %>% ggplot() + geom_line(aes(x = rep(nice(seq(min(x), max(x), length.out=5)), 5), y = value, color=key))
And regarding question 2, an intuitive way to think about it might be that, since the mean of Yhat doesn't change (predicted with fixed z), all the standard normal CDFs of the partial effects share the same intersection. E.g.
sapply(seq(0, 5, length.out = 15), function(k) {
predict(test, data.frame(x = seq(-20, 20, length.out = 200), z = k), type = "response")}) %>%
as.data.frame() %>% gather() %>% ggplot() +
geom_line(aes(x = rep(seq(-20, 20, length.out = 200), 15), y = value, color = key))
No comments:
Post a Comment