Created
July 31, 2015 08:18
-
-
Save hnagata/d2c607d6b4eba4f17304 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## grid / ggplot ---- | |
| library(ggplot2) | |
| library(grid) | |
| make.grid <- function(row, col) { | |
| grid.newpage() | |
| l <- grid.layout(row, col) | |
| v <- viewport(layout=l) | |
| pushViewport(v) | |
| } | |
| print.at <- function(o, i, j) { | |
| print(o, vp=viewport(layout.pos.row=i, layout.pos.col=j)) | |
| } | |
| end.grid <- function() { | |
| popViewport() | |
| } | |
| plot.to.file <- FALSE | |
| ## データの読み込み ---- | |
| dat <- read.csv("user.csv", fileEncoding="utf-8") | |
| dat$date <- as.POSIXct(dat$date, tz="JST") | |
| dat <- dat[order(dat$user, dat$date), ] | |
| enam <- c("chartInterval", "chartStability", "chartExpressiveness", "chartRhythm", "chartVibratoLongtone") | |
| ## とりあえずlm ---- | |
| lm.std <- lm(totalPoint ~ chartInterval + chartStability + chartExpressiveness + chartVibratoLongtone + chartRhythm, data=dat) | |
| summary(lm.std) | |
| ## ノンパラlm ---- | |
| mask.nonp <- rep(TRUE, nrow(dat)) | |
| # 総合点100点を除外 (打ち切り対策) | |
| mask.nonp <- mask.nonp & dat$totalPoint < 100 | |
| ## 各項目30以上だけ使う (singular 対策) | |
| #mask.nonp <- mask.nonp & apply(dat[, enam], 1, function(row) all(row[enam] >= 30)) | |
| # Singular 対策 | |
| mask.nonp <- mask.nonp & dat$chartRhythm != 0 | |
| mask.nonp <- mask.nonp & dat$chartRhythm != 2 | |
| mask.nonp <- mask.nonp & dat$chartExpressiveness != 13 | |
| # 推定 | |
| do.lm <- function(mask.nonp, use.user=FALSE) { | |
| enam <- if (use.user) c(enam, "user") else enam | |
| dat.lm <- dat[mask.nonp, c("totalPoint", enam)] | |
| for (en in enam) dat.lm[[en]] <- factor(dat.lm[[en]]) | |
| design.lm <- data.frame(model.matrix(~ . - 1, dat.lm)) | |
| if ("chartInterval0" %in% colnames(design.lm)) { | |
| design.lm <- design.lm[, -which(colnames(design.lm) == "chartInterval0")] | |
| } | |
| if (use.user) { | |
| ucol <- paste0("user", levels(dat$user)[-1]) | |
| design.lm[, ucol] <- design.lm[, ucol] - as.numeric(dat$user[mask.nonp] == levels(dat$user)[1]) | |
| } | |
| lm(totalPoint ~ . - 1, design.lm) | |
| } | |
| lm.nonp <- do.lm(mask.nonp) | |
| summary(lm.nonp) | |
| # 係数のプロット | |
| coef.nonp <- coef(lm.nonp) | |
| print.coef <- function(coef.nonp, lower=40) { | |
| value <- c(-1, lower : 100) | |
| df <- data.frame( | |
| elements = factor(rep(enam, each=length(value)), levels=enam), | |
| value = rep(value, 5), | |
| beta = as.vector(sapply(enam, function(en) { | |
| coef.nonp[paste0(en, value)] | |
| })) | |
| ) | |
| f <- function(en) { | |
| ggplot(df[df$elements == en | df$value == -1, ], aes(x=value, y=beta)) + | |
| geom_point(aes(color=elements)) + | |
| scale_x_continuous(en, limits=c(lower, 100), breaks=seq(lower, 100, 10)) + | |
| theme(legend.position="none") | |
| } | |
| make.grid(3, 2) | |
| print.at(f("chartInterval"), 1, 1) | |
| print.at(f("chartStability"), 1, 2) | |
| print.at(f("chartExpressiveness"), 2, 1) | |
| print.at(f("chartRhythm"), 2, 2) | |
| print.at(f("chartVibratoLongtone"), 3, 1) | |
| end.grid() | |
| } | |
| #print.coef(coef.nonp) | |
| # 残差の分布を見る | |
| resid.nonp <- resid(lm.nonp) | |
| print.resid.point <- function(resid.nonp, mask.nonp) { | |
| df <- data.frame( | |
| index = 1 : length(resid.nonp), | |
| resid = resid.nonp, | |
| user = factor(as.numeric(dat$user[mask.nonp]) %% 3, levels=0:2) | |
| ) | |
| make.grid(1, 1) | |
| print.at( | |
| ggplot(df, aes(x=index, y=resid)) + | |
| geom_point(aes(color=user), size=1) + | |
| theme(legend.position="none"), | |
| 1, 1) | |
| end.grid() | |
| } | |
| print.resid.hist <- function(resid.nonp) { | |
| df <- data.frame(resid = resid.nonp) | |
| make.grid(1, 1) | |
| print.at( | |
| ggplot(df, aes(x=resid)) + | |
| geom_histogram(aes(fill=0)) + | |
| theme(legend.position="none"), | |
| 1, 1) | |
| end.grid() | |
| } | |
| if (plot.to.file) svg("total-resid-index.svg", width=6, height=4) | |
| print.resid.point(resid.nonp, mask.nonp) | |
| if (plot.to.file) dev.off() | |
| if (plot.to.file) svg("total-hist-resid.svg", width=6, height=4) | |
| print.resid.hist(resid.nonp) | |
| if (plot.to.file) dev.off() | |
| ## 外れ値を除去 ---- | |
| # ユーザーごとで 99.99% 点より外側のサンプルを除外 | |
| quant.resid <- tapply(resid.nonp, dat$user[mask.nonp], function(r) quantile(r, probs=c(0.4, 0.5, 0.6))) | |
| mask.ol <- mapply( | |
| function(resid, user) { | |
| quant <- quant.resid[[user]] | |
| resid >= quant["50%"] - (quant["60%"] - quant["40%"]) * (qnorm(0.9999) / (2*qnorm(0.6))) | |
| }, | |
| resid.nonp, dat$user[mask.nonp] | |
| ) | |
| mask.nonp2 <- mask.nonp | |
| mask.nonp2[mask.nonp] <- mask.ol | |
| # 判定された外れ値を見る | |
| olresid.df <- data.frame( | |
| index = 1 : length(resid.nonp), | |
| resid = resid.nonp, | |
| col = ifelse(mask.nonp2[mask.nonp], "darkgray", "red") | |
| ) | |
| if (plot.to.file) svg("total-resid-index-ol.svg", width=6, height=4) | |
| make.grid(1, 1) | |
| print.at( | |
| ggplot(olresid.df, aes(x=index, y=resid)) + | |
| geom_point(color=olresid.df$col, size=1) + | |
| theme(legend.position="none"), | |
| 1, 1) | |
| end.grid() | |
| if (plot.to.file) dev.off() | |
| # 推定 | |
| lm.nonp2 <- do.lm(mask.nonp2, use.user=TRUE) | |
| summary(lm.nonp2) | |
| # 係数のプロット | |
| coef.nonp2 <- coef(lm.nonp2) | |
| if (plot.to.file) svg("total-coef-elem.svg", width=8, height=12) | |
| print.coef(coef.nonp2, lower=40) | |
| if (plot.to.file) dev.off() | |
| # 各項目 80 以上での拡大図 | |
| df <- data.frame( | |
| elements = factor(rep(enam, each=21), levels=enam), | |
| value = rep(80 : 100, 5), | |
| beta = as.vector(sapply(enam, function(en) { | |
| ecoef <- coef.nonp2[paste0(en, 80 : 100)] | |
| ecoef - max(ecoef, na.rm=TRUE) | |
| })) | |
| ) | |
| border <- 100 - sum(coef.nonp2[paste0(enam, 100)[-1]]) - coef.nonp2["chartInterval95"] | |
| border | |
| if (plot.to.file) svg("total-coef-elem-o80.svg", width=6, height=4) | |
| make.grid(1, 1) | |
| print.at( | |
| ggplot(df, aes(x=value, y=beta, group=elements)) + | |
| geom_hline(yintercept=border, color=1) + | |
| geom_point(aes(color=elements)) + | |
| geom_line(aes(color=elements)) + | |
| xlim(c(80, 100)), | |
| 1, 1) | |
| end.grid() | |
| if (plot.to.file) dev.off() | |
| # 残差の分布を見る | |
| print.resid.hist(resid(lm.nonp2)) | |
| # 裏加点の box-plot | |
| ura.per.user <- coef.nonp2[paste0("user", levels(dat$user)[-1])] | |
| ura.per.user <- c(-sum(ura.per.user), ura.per.user) | |
| df <- data.frame( | |
| ura = resid(lm.nonp2) + ura.per.user[as.numeric(dat$user[mask.nonp2])], | |
| user = dat$user[mask.nonp2] | |
| ) | |
| df <- df[order(ura.per.user[as.numeric(dat$user[mask.nonp2])], decreasing=TRUE), ] | |
| df$user <- factor(as.numeric(df$user), levels=unique(as.numeric(df$user))) | |
| df$ucol <- factor(as.numeric(df$user) %% 3, levels=0:2) | |
| if (plot.to.file) svg("total-resid-user.svg", width=6, height=4) | |
| make.grid(1, 1) | |
| print.at( | |
| ggplot(df, aes(user, ura)) + | |
| geom_boxplot(aes(color=ucol), outlier.size=1) + | |
| theme(legend.position="none", axis.ticks=element_blank(), axis.text.x=element_blank()), | |
| 1, 1) | |
| end.grid() | |
| if (plot.to.file) dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment