-
Notifications
You must be signed in to change notification settings - Fork 23
/
Thursday2.R
147 lines (124 loc) · 3.82 KB
/
Thursday2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
Date: Thursday, August 9, 2012
####################################################
### Statistical Analysis of a Numerical Variable ###
####################################################
### Inference about an unknown ###
### distribution function ###
# We consider a dataset with a numerical variable X and
# observations X[1], X[2], ..., X[n]
# Data example: Positive monthly rents of students
# in StatWiSo2003.txt:
ds <- read.table(file="StatWiSo2003.txt",
header=TRUE,sep="\t")
str(ds)
# Extract rents:
X <- ds$MonMiete
X
# Get rid of zeros and NAs:
# Oh, you could put a condition there!
X <- X[X > 0 & !is.na(X)]
X
# Sort the values:
X <- sort(X)
X
# Simple summaries:
mean(X)
median(X)
# Popular graphical display: Histogram
hist(X)
hist(X,col="blue")
hist(X,col="blue", breaks=seq(0,2100, by=50))
# Graphical display: Instead of the popular but
# problematic histograms we use the empirical
# distribution function of the observations:
# Fhat(q) = (number of i with X_i <= q)/n .
# is a stepfunction!
plot.ecdf(X)
plot.ecdf(X,pch="")
plot.ecdf(X,pch="",lwd=2)
plot.ecdf(X,pch="",lwd=2,verticals=TRUE)
plot.ecdf(X,pch="",lwd=2,verticals=TRUE,
xlab="q",ylab="Fhat(q)",
main="empirical c.d.f. of monthly rents")
grid()
grid(col="red")
rug(X)
# Statistical model: The X[i] are independent copies
# of a random variable X_o with (unknown) distribution
# function F, i.e.
# Prob(X_o <= q) = F(q) for any q .
# In our explicit example: F(q) = relative proportion
# of all students in Bern (in 2003) paying rent <= q
# among all students paying rent > 0. Here X_o stands
# for the monthly rent of a randomly chosen student
# from that population.
# Illustration of the concept of empirical and
# true distribution function:
# Use a so-called gamma distribution with parameters
# shape > 0, scale > 0. The mean of such a distribution
# is shape*scale, the standard deviation equals
# sqrt(shape)*scale. With the
# monthly rent in mind, we want to achieve
# mean = 500 (CHF) and standard deviation = 250,
# so ... shape = 4, scale = 250.
# Simulate random sample from gamma(shape=4,scale=125):
n <- 100
X <- rgamma(n,shape=4,scale=125)
# Plot Fhat:
plot.ecdf(X,verticals=TRUE,pch="",lwd=2,
xlab="q",ylab="Fhat(q)",main="")
grid()
rug(X)
# Plot F:
qq <- seq(0,2000,length.out=1001)
Fqq <- pgamma(qq,shape=4,scale=125)
lines(qq,Fqq,col="green",lwd=2)
# Repeat this with different values of n ...
# Now we are a little more ambitious and want to
# first plot F and then superimpose Fhat:
plot(qq,Fqq,type="l",col="green",lwd=2,
xlab="q",ylab="F(q), Fhat(q)")
# Generate new sample:
n <- 100
X <- rgamma(n,shape=4,scale=125)
# Plot Fhat "by hand":
xx <- c(0,rep(sort(X),each=2),max(2000,max(X)))
Fhatxx <- rep((0:n)/n,each=2)
lines(xx,Fhatxx,lwd=2)
grid()
rug(X)
# Again repeat this with different values of n ...
# A little more theory:
# One can show that for any t > 0,
# Prob(|Fhat(q) - F(q)| > t for some q)
# <= 2 * exp(- 2*n*t^2) ,
# regardless of the unknown F.
# Hence with probability at least 1 - alpha,
# the maximum of |Fhat(q) - F(q)| is no larger
# than
# c(n,alpha) = sqrt(log(2/alpha) / (2*n)).
# In other words, with probability at least
# 1 - alpha, for each q, the unknown value of
# F(q) lies between
# max(Fhat(q) - c(n,alpha), 0)
# and
# min(Fhat(q) + c(n,alpha), 1)
# (Kolmogorov-Smirnov confidence band for F).
source("KSBand.R")
# Example to test your function on:
# (source("KSBand.R"))
ds <- read.table(file="StatWiSo2003.txt",
header=TRUE,sep="\t")
X <- ds$MonMiete
X <- X[X > 0 & !is.na(X)]
KSBand(X,alpha=0.05,qmin=0)
KSBand(X,alpha=0.01,qmin=0)
KSBand(X,alpha=0.1,qmin=0)
KSBand(X,alpha=0.1,qmin=0,qmax=2000)
# Illustration of the K-S-bands
# with simulated data:
qq <- seq(0,2000,length.out=1001)
Fqq <- pgamma(qq,shape=4,scale=125)
n <- 500
X <- rgamma(n,shape=4,scale=125)
KSBand(X,0.05,qmin=0,qmax=2000,qq=qq,Fqq=Fqq)