导图社区 数据分析实战
R语言数据分析实战,让数据分析不再困难!
编辑于2020-09-22 19:52:58数据分析
线性回归
fit<-lm(weight~height,data=women) summary(fit)/summary.lm(fit)
coefficients(fit)
confint(fit,level=0.5)置信区间
fitted(fit)
residuals(fit) 残差值
women1<-women predict(fit,women1)
plot(women$height,women$weight) abline(fit)绘制出拟合曲线 lines(women$height,fitted(fit2),col="red") lines(women$height,fitted(fit3),col="blue")
fit2<-lm(weight~height+I(height^2),data=women) summary(fit2)
fit3<-lm(weight~height+I(height^2)+I(height^3),data=women)
多元线性回归
states<-as.data.frame(state.x77[,c("Murder","Population","Illiteracy","Income","Frost")])
fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
summary(fit) options(digits=4) coef(fit)
fit<-lm(mpg~hp+wt+hp:wt,data=mtcars) summary(fit)
AIC比较模型好坏
fit1<-lm(Murder~Population+Illiteracy+Income+Frost,data=states) fit2<-lm(Murder~Population+Illiteracy,data=states) AIC(fit1,fit2)
回归诊断
opar<-par(no.readonly=TRUE) fit<-lm(weight~height,data=women) par(mfrow=c(2,2)) plot(fit)
方差分析
install.packages("multcomp") library(multcomp)
attach(cholesterol) table(trt) aggregate(response,by=list(trt),FUN=mean)
fit<-aov(response~trt,data=cholesterol) summary(fit) plot(fit)
fit.lm<-lm(response~trt,data=cholesterol)
单因素协方差
table(litter$dose) attach(litter) aggregate(weight,by=list(dose),FUN=mean)
fit<-aov(weight~gesttime+dose,data=litter) summary(fit)
双因素方差分析
attach(ToothGrowth) xtabs(~supp+dose)统计频率 aggregate(len,by=list(supp,dose),FUN=mean) aggregate(len,by=list(supp,dose),FUN=sd)
转换为因子ToothGrowth$dose<-factor(ToothGrowth$dose) fit<-aov(len~supp*dose,data=ToothGrowth) summary(fit)
interaction.plot(dose,supp,len,type="b",col=c("red","blue"),pch=c(16,18),main="Interaction between Dose and Supplement Type")
多元方差分析
library(MASS) UScereal attach(UScereal)加载数据集 shelf<-factor(shelf) aggregate(cbind(calories,fat,sugars),by=list(shelf),FUN=mean)
fit<-manova(cbind(calories,fat,sugars)~shelf) summary(fit) summary.aov(fit)
功效分析 (建模以及检验数据关系)
功效=1-Ⅱ型错误的概率 即真实效应存在时能够检测到它的概率
效应值指备择或研究假设下效应的量。 效应值的表达式依赖于假设检验中使用的统计方法
四个基本量(给定任意三个,可推算第四个)
功效1-p(Ⅱ型错误) 效应值ES 样本大小n 显著性水平p(Ⅰ型错误)
线性回归功效分析
pwr包 pwr.f2.test(u=3,sig.level=0.05,power=0.9,f2=0.0769) options(digits=0.5)
方差分析功效分析
pwr.anova.test(k=2,f=0.25,sig.level=0.05,power=0.9) f效应值
广义线性模型 (二项,泊松,伽马分布等) 极大似然法估计
泊松回归
用来为计数资料和列联表建模的一种回归分析 假设因变量是泊松分布 假设它平均值的对数可被未知参数的线性组合建模
data(breslow.dat,package="robust")加载数据集 summary(breslow.dat) attach(breslow.dat) fit<-glm(sumY~Base+Age+Trt,data=breslow.dat,family=poisson(link="log")) summary(fit) coef(fit) exp(coef(fit))
logistic回归
通过一系列连续型或类别型预测变量预测二值型结果变量 根据危险因素预测某疾病发生的概率
data(Affairs,package="AER") summary(Affairs) table(Affairs$affairs) prop.table(table(Affairs$affairs)) prop.table(table(Affairs$gender))
Affairs$ynaffair[Affairs$affairs>0]<-1 Affairs$ynaffair[Affairs$affairs==0]<-0 head(Affairs) Affairs$ynaffair<-factor(Affairs$ynaffair,levels=c(0,1),labels=c("No","Yes")) table(Affairs$ynaffair)
attach(Affairs) fit<-glm(ynaffair~gender+age+yearsmarried+children+religiousness+education+occupation+rating,data=Affairs,family=binomial()) summary(fit) fit1<-glm(ynaffair~age+yearsmarried+religiousness+rating,data=Affairs,family=binomial()) anova(fit,fit1,test="Chisq")卡方检验 exp(coef(fit))
testdata<-data.frame(rating=c(1,2,3,4,5),age=mean(Affairs$age),yearsmarried=mean(Affairs$yearsmarried),religiousness=mean(Affairs$religiousness)) head(testdata) testdata$prob<-predict(fit1,newdata=testdata,type="response") testdata<-data.frame(rating=mean(Affairs$rating),age=seq(17,57,10),yearsmarried=mean(Affairs$yearsmarried),religiousness=mean(Affairs$religiousness)) head(testdata) testdata$prob<-predict(fit1,newdata=testdata,type="response")
主成分分析
简称PCA,数据降维 将大量相关变量转化为一组很少的不相关变量 对原始变量重新进行线性组合,将原先众多具有一定相关性的指标,重新组合为一组新的相互独立的综合指标
psych包 USJudgeRatings fa.parallel(USJudgeRatings,fa="pc",n.iter=100) pc<-principal(USJudgeRatings,nfactors=1,rotate="none",scores=FALSE)指定旋转方法/是否指定得分 pc<-principal(USJudgeRatings,nfactors=1,rotate="none",scores=TRUE)
fa.parallel(Harman23.cor$cov,n.obs=302,fa="pc",n.iter=100,show.legend=FALSE) 看图有几个因子在y=1以上 pc<-principal(Harman23.cor$cov,nfactors=2,rotate="none")
主成分的旋转 pc<-principal(Harman23.cor$cov,nfactors=2,rotate="varimax")
因子分析
一系列用来发现一组变量的潜在结构方法 通过寻找一组更小的、潜在的或隐藏的结构来解释已观测到的、显式的变量间的关系 本质降维
options(digits=2) covariances<-ability.cov$cov correlations<-cov2cor(covariances) fa.parallel(correlations,fa="both",n.obs=112,n.iter=100) fa<-fa(correlations,nfactors=2,rotate="none",fm="pa") fa<-fa(correlations,nfactors=2,rotate="none",fm="pa",scores=TRUE) fa$weights fa.varimax<-fa(correlations,nfactors=2,rotate="varimax",fm="pa")正交旋转 fa.promax<-fa(correlations,nfactors=2,rotate="promax",fm="pa")斜交旋转 factor.plot(fa.promax,labels=rownames(fa.promax$loadings)) fa.diagram(fa.varimax,simple=FALSE)
购物篮分析
arules包 data(Groceries) inspect(Groceries) fit<-apriori(Groceries,parameter=list(support=0.01,confidence=0.5)) inspect(fit)