Plotting the relationship between video length and funding outcome:
# Read the Kickstarter project data from the course site. "UTF-8-BOM" makes
# read.csv strip the byte-order mark at the start of the file.
mydata <- read.csv(
  file = "https://ximarketing.github.io/class/Kickstarter-Project.csv",
  fileEncoding = "UTF-8-BOM"
)
# Keep only the rows flagged as having a video. which() discards rows where
# the comparison is NA, so no all-NA rows are introduced.
subdata <- mydata[which(mydata[["IsVideoAvailable"]] == 1), , drop = FALSE]
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.1
# Bar chart of the mean Outcome within each of 60 bins of VideoLength.
# xlim(0, 400) restricts the x scale, so rows with VideoLength outside that
# range are removed before summarising (hence the "removed rows" warnings).
# `fun` is the current name of the summary-function argument; the older
# `fun.y` spelling is deprecated.
ggplot(subdata, mapping = aes(x = VideoLength, y = Outcome)) +
  stat_summary_bin(fun = "mean", geom = "bar", bins = 60) +
  xlim(0, 400)
## Warning: Removed 162 rows containing non-finite values (stat_summary_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Including quadratic terms in your regression (a linear-only logit is fitted first as a baseline for comparison):
# Baseline logistic regression: Outcome explained by VideoLength alone,
# fitted on the projects that have a video.
logit <- glm(
  formula = Outcome ~ VideoLength,
  family = "binomial",
  data = subdata
)
summary(logit)
##
## Call:
## glm(formula = Outcome ~ VideoLength, family = "binomial", data = subdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5934 -0.9742 -0.9593 1.3909 1.4258
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5689252 0.0481630 -11.812 <2e-16 ***
## VideoLength 0.0004219 0.0002285 1.846 0.0649 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7017.7 on 5290 degrees of freedom
## Residual deviance: 7014.2 on 5289 degrees of freedom
## AIC: 7018.2
##
## Number of Fisher Scoring iterations: 4
# Same logistic regression with a squared VideoLength term added. I() is
# needed so that ^ is treated as arithmetic rather than as a formula operator.
# This overwrites the previous `logit` object.
logit <- glm(
  formula = Outcome ~ VideoLength + I(VideoLength^2),
  family = "binomial",
  data = subdata
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit)
##
## Call:
## glm(formula = Outcome ~ VideoLength + I(VideoLength^2), family = "binomial",
## data = subdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1024 -1.0060 -0.8702 1.3246 5.0005
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.192e+00 8.958e-02 -13.307 < 2e-16 ***
## VideoLength 6.541e-03 8.102e-04 8.074 6.81e-16 ***
## I(VideoLength^2) -1.056e-05 1.584e-06 -6.666 2.63e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7017.7 on 5290 degrees of freedom
## Residual deviance: 6929.1 on 5288 degrees of freedom
## AIC: 6935.1
##
## Number of Fisher Scoring iterations: 7
# Log-transform the funding raised. log1p(x) computes log(1 + x) directly,
# which is the base-R idiom for this transform and is more accurate than
# log(x + 1) when x is close to zero.
mydata$LogFunding <- log1p(mydata$FundingRaised)
# Linear model with an interaction: `Created * NumberOfProducts` expands to
# both main effects plus the Created:NumberOfProducts interaction term.
result <- lm(formula = LogFunding ~ Created * NumberOfProducts, data = mydata)
summary(result)
##
## Call:
## lm(formula = LogFunding ~ Created * NumberOfProducts, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.8916 -2.3080 0.3523 2.2554 8.6714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.039097 0.071653 42.414 < 2e-16 ***
## Created 0.240761 0.042393 5.679 1.41e-08 ***
## NumberOfProducts 0.443064 0.008182 54.148 < 2e-16 ***
## Created:NumberOfProducts -0.012090 0.005019 -2.409 0.016 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.126 on 6954 degrees of freedom
## Multiple R-squared: 0.3358, Adjusted R-squared: 0.3355
## F-statistic: 1172 on 3 and 6954 DF, p-value: < 2.2e-16
# Rebuild the video-only subset from mydata so that it carries the columns
# added to mydata since the subset was first taken (e.g. LogFunding).
subdata <- mydata[which(mydata[["IsVideoAvailable"]] == 1), , drop = FALSE]
# Interaction between a categorical variable and Human: factor(Gender) * Human
# yields the Gender dummies, Human, and one interaction term per dummy.
result <- lm(formula = LogFunding ~ factor(Gender) * Human, data = subdata)
summary(result)
##
## Call:
## lm(formula = LogFunding ~ factor(Gender) * Human, data = subdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8754 -1.8513 0.5323 2.3194 7.8001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.1768 0.3778 13.704 < 2e-16 ***
## factor(Gender)M 0.4710 0.4003 1.177 0.23939
## factor(Gender)U 1.9207 0.4057 4.734 2.26e-06 ***
## Human 2.3467 0.4137 5.672 1.48e-08 ***
## factor(Gender)M:Human -1.1873 0.4411 -2.692 0.00713 **
## factor(Gender)U:Human -0.5688 0.4453 -1.277 0.20153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.228 on 5285 degrees of freedom
## Multiple R-squared: 0.115, Adjusted R-squared: 0.1142
## F-statistic: 137.4 on 5 and 5285 DF, p-value: < 2.2e-16
# Log-transform the funding target. log1p(x) computes log(1 + x) directly,
# which is the base-R idiom for this transform and is more accurate than
# log(x + 1) when x is close to zero.
mydata$LogTarget <- log1p(mydata$Target)
# Regress log funding on log target, with Subtype and Location entered as
# factors so that each level other than the reference gets its own dummy.
result <- lm(
  formula = LogFunding ~ LogTarget + factor(Subtype) + factor(Location),
  data = mydata
)
summary(result)
##
## Call:
## lm(formula = LogFunding ~ LogTarget + factor(Subtype) + factor(Location),
## data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0535 -2.3941 0.4245 2.4424 10.4402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.66186 0.40466 16.463 < 2e-16 ***
## LogTarget 0.20560 0.02403 8.556 < 2e-16 ***
## factor(Subtype)Apps -4.19174 0.33504 -12.511 < 2e-16 ***
## factor(Subtype)CameraEquipment 1.30956 0.45434 2.882 0.003960 **
## factor(Subtype)DIYElectronics -0.20024 0.40840 -0.490 0.623940
## factor(Subtype)FabricationTools -2.53178 0.56363 -4.492 7.17e-06 ***
## factor(Subtype)Flight -1.50850 0.47699 -3.163 0.001571 **
## factor(Subtype)Gadgets -0.33102 0.34636 -0.956 0.339246
## factor(Subtype)Hardware 0.18633 0.33815 0.551 0.581627
## factor(Subtype)Makerspaces -1.07444 0.56385 -1.906 0.056752 .
## factor(Subtype)Robots -0.10786 0.43235 -0.249 0.803008
## factor(Subtype)Software -3.07865 0.34560 -8.908 < 2e-16 ***
## factor(Subtype)Sound 0.05697 0.42930 0.133 0.894436
## factor(Subtype)SpaceExploration -0.88965 0.50991 -1.745 0.081076 .
## factor(Subtype)Technology -1.70381 0.33201 -5.132 2.95e-07 ***
## factor(Subtype)Wearables 0.19453 0.38017 0.512 0.608882
## factor(Subtype)Web -4.43896 0.34505 -12.865 < 2e-16 ***
## factor(Location)IL -0.96675 0.16151 -5.986 2.26e-09 ***
## factor(Location)MA 0.07324 0.15776 0.464 0.642465
## factor(Location)NY -0.36623 0.11099 -3.300 0.000973 ***
## factor(Location)TX -1.31772 0.11932 -11.043 < 2e-16 ***
## factor(Location)WA -0.63908 0.16752 -3.815 0.000137 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.306 on 6936 degrees of freedom
## Multiple R-squared: 0.2588, Adjusted R-squared: 0.2566
## F-statistic: 115.3 on 21 and 6936 DF, p-value: < 2.2e-16