-
Notifications
You must be signed in to change notification settings - Fork 3
/
uk_introAlienZoo_analysis_EXP1.Rmd
2555 lines (1940 loc) · 139 KB
/
uk_introAlienZoo_analysis_EXP1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Introducing the Alien Zoo: Summary of evaluation and results for Exp1 (DS4 condition, March 2022)"
output:
pdf_document:
toc: TRUE
toc_depth: 5
bibliography: IntroAlienZoo.bib
---
```{r setup, include=FALSE}
# Global knitr defaults: echo code in the rendered document.
# (Extended with warning=FALSE in the next chunk.)
knitr::opts_chunk$set(echo = TRUE)
```
```{r results='asis', echo=FALSE, include=FALSE}
# Re-set chunk defaults, additionally suppressing warnings in the output
knitr::opts_chunk$set(echo = TRUE, warning=FALSE)
# Plotting and data-wrangling libraries
library(rstudioapi)
library(ggplot2)
library(ggrepel)
library(plyr)
library(dplyr)
# Suppress summarise info
options(dplyr.summarise.inform = FALSE)
library(unikn)
library(ggpubr)
library(data.table)
library(tidyverse)
library(scales)
library(effsize)
# for the lme approach:
library("emmeans")
library("sjstats")
library("lme4")
library("lmerTest")
library("MuMIn")
# turn off scientific notation for exact values
options(scipen = 999)
# Barrier-free color palette
# Source: Okabe & Ito (2008): Color Universal Design (CUD):
# Fig. 16 of <https://jfly.uni-koeln.de/color/>:
# (a) Vector of colors (as RGB values):
Okabe_Ito_palette <- c(rgb( 0, 0, 0, maxColorValue = 255), # black
rgb(230, 159, 0, maxColorValue = 255), # orange
rgb( 86, 180, 233, maxColorValue = 255), # skyblue
rgb( 0, 158, 115, maxColorValue = 255), # green
rgb(240, 228, 66, maxColorValue = 255), # yellow
rgb( 0, 114, 178, maxColorValue = 255), # blue
rgb(213, 94, 0, maxColorValue = 255), # vermillion
rgb(204, 121, 167, maxColorValue = 255) # purple
)
# (b) Vector of color names:
o_i_names <- c("black", "orange", "skyblue", "green", "yellow", "blue", "vermillion", "purple")
# (c) Use newpal() to combine colors and names:
pal_okabe_ito <- newpal(col = Okabe_Ito_palette,
names = o_i_names)
# Group colors used throughout: Control = skyblue, CFE/explanation = green
Ccol=Okabe_Ito_palette[3]
Pcol=Okabe_Ito_palette[4]
# palette for likert scale data, inspired by yellow and vermillion values from Okabe_Ito
likert_Okabe_Ito_palette <- c(rgb(213, 94, 0, maxColorValue = 255), # vermillion (strongly disagree)
rgb(234, 175, 128, maxColorValue = 255), # middle yellow red (disagree)
rgb(255, 255, 255, maxColorValue = 255), # white (neutral)
rgb(249, 245, 179, maxColorValue = 255), # medium champagne (agree)
rgb(240, 228, 66, maxColorValue = 255) # yellow (strongly agree)
)
# CSV header for the string accumulating all statistical results
# (a redundant empty initialisation was removed; it was overwritten immediately)
matchingRes="Comparison,ShapiroPval,TestUsed,TestPval,TestEffSize"
# source adapted wilcoxon test (easy computation of effect size)
source("uk_wilcox.test.R")
```
\newpage
# Introduction
This is an analysis of data acquired in the "Introducing the Alien Zoo" study run on Amazon Mechanical Turk in March 2022. In this study, naive users were asked to interact with the Alien Zoo paradigm to understand relationships in an unknown dataset, which has been termed “learning to discover” by [@adadi_peeking_2018]. At regular intervals, participants receive either counterfactual explanations (CFEs) regarding past choices, or no explanation. Computed CFEs correspond to "Control" counterfactuals that fulfill the "smallest feature change" condition [@wachter_counterfactual_2017].
This script evaluates data from Experiment 1 (three features, plant 2, 4, and 5, impacting growth rate).
# First things first: rough data cleaning
Let's first just look at the data we have. Excluding all users that had incomplete datasets, what is the turnout?
```{r echo=FALSE, warning=FALSE}
# Set working directory to source file location
# (here::set_here() creates/returns the path of a .here sentinel file)
sourceLoc=here::set_here()
setwd(dirname(sourceLoc))
# Locate the five raw logging streams (one CSV each) in ../UserData
demo_source = list.files(path = "../UserData", pattern="demographics_IAZ_EXP1.csv",full.names=TRUE)
perf_source = list.files(path = "../UserData", pattern="performance_IAZ_EXP1.csv",full.names=TRUE)
rt_source = list.files(path = "../UserData", pattern="reactionTime_IAZ_EXP1.csv",full.names=TRUE)
survey_source = list.files(path = "../UserData", pattern="survey_IAZ_EXP1.csv",full.names=TRUE)
attention_source = list.files(path = "../UserData", pattern="attentionCheck_IAZ_EXP1.csv",full.names=TRUE)
# load to dframes
df_demo=read.csv(demo_source,header=TRUE)
df_perf=read.csv(perf_source,header=TRUE)
df_rt=read.csv(rt_source,header=TRUE)
df_survey=read.csv(survey_source,header=TRUE)
df_attention=read.csv(attention_source,header=TRUE)
# remove duplicated lines (double-logging happens occasionally)
df_demo=df_demo[!duplicated(df_demo),]
df_perf=df_perf[!duplicated(df_perf), ]
df_rt=df_rt[!duplicated(df_rt), ]
df_survey=df_survey[!duplicated(df_survey), ]
df_attention=df_attention[!duplicated(df_attention), ]
# truncate user IDs after 5 characters for better visualization
# NOTE(review): assumes the first 5 characters are still unique per user — verify against raw IDs
df_demo$userId=substr(df_demo$userId,1,5)
df_perf$userId=substr(df_perf$userId,1,5)
df_rt$userId=substr(df_rt$userId,1,5)
df_survey$userId=substr(df_survey$userId,1,5)
df_attention$userId=substr(df_attention$userId,1,5)
# group as factor, recode values to more descriptive letters:
# 1 -> "C" (control), 0 -> "E" (explanation/CFE)
df_demo$group=as.factor(df_demo$group)
df_demo$group=recode_factor(df_demo$group,"1"="C","0"="E")
df_perf$group=as.factor(df_perf$group)
df_perf$group=recode_factor(df_perf$group,"1"="C","0"="E")
df_rt$group=as.factor(df_rt$group)
df_rt$group=recode_factor(df_rt$group,"1"="C","0"="E")
df_survey$group=as.factor(df_survey$group)
df_survey$group=recode_factor(df_survey$group,"1"="C","0"="E")
df_attention$group=as.factor(df_attention$group)
df_attention$group=recode_factor(df_attention$group,"1"="C","0"="E")
# drop fields 'X' if present (was problem with simulated data)
# note: presence is only checked in df_perf; the other dfs are assumed to match
if("X" %in% colnames(df_perf)){
df_demo=subset(df_demo,select=-c(X))
df_perf=subset(df_perf,select=-c(X))
df_rt=subset(df_rt,select=-c(X))
df_survey=subset(df_survey,select=-c(X))
df_attention=subset(df_attention,select=-c(X))
}
# correct item number in survey df if zero based
if(0 %in% df_survey$itemNo){
df_survey$itemNo=df_survey$itemNo+1
}
# sanity check: Number of users in each df the same?
# make df with user numbers: user ID 'XXXX' appears in how many dfs?
# the value of 5 would be desirable: all IDs represented in all 5 dfs
userNo_raw <- data.frame(table(c(unique(df_perf$userId), unique(df_rt$userId), unique(df_survey$userId), unique(df_attention$userId),c(unique(df_demo$userId)))))
names(userNo_raw) <- c("Names", "Matches")
# Now, remove participants who did not finish all parts (i.e., not being part of all 5 dfs)
df_perf=df_perf[!df_perf$userId %in% userNo_raw$Names[!userNo_raw$Matches==5],]
df_rt=df_rt[!df_rt$userId %in% userNo_raw$Names[!userNo_raw$Matches==5],]
df_survey=df_survey[!df_survey$userId %in% userNo_raw$Names[!userNo_raw$Matches==5],]
df_attention=df_attention[!df_attention$userId %in% userNo_raw$Names[!userNo_raw$Matches==5],]
# sort according to user ID and trial number:
df_perf=df_perf[order(df_perf$userId, df_perf$trialNo),]
df_rt=df_rt[order(df_rt$userId, df_rt$TrialNr),]
df_survey=df_survey[order(df_survey$userId, df_survey$itemNo),]
df_attention=df_attention[order(df_attention$userId, df_attention$trialNo),]
# design specs to make some things easier:
# trial/item counts are inferred from the data; attention checks are fixed at 2 by design
numTrials=max(df_perf$trialNo)
numAttentionTrials=2
numSurveyItems=max(df_survey$itemNo)
# number of trials per userId (to spot users with incomplete logs):
df_perf_trialsPerUser=aggregate(trialNo ~ userId, data = df_perf, FUN = function(x){NROW(x)})
df_rt_trialsPerUser=aggregate(TrialNr ~ userId, data = df_rt, FUN = function(x){NROW(x)})
df_survey_checkedItemsPerUser=aggregate(checked ~ userId, data = df_survey, FUN = function(x){sum(x==1)})
df_attention_trialsPerUser=aggregate(trialNo ~ userId, data = df_attention, FUN = function(x){NROW(x)})
# collect "odd" userIDs: anyone with too few trials, survey items, or attention checks
odd_userIDs=unique(c(df_perf_trialsPerUser$userId[!df_perf_trialsPerUser$trialNo==numTrials],
df_rt_trialsPerUser$userId[!df_rt_trialsPerUser$TrialNr==numTrials],
df_survey_checkedItemsPerUser$userId[df_survey_checkedItemsPerUser$checked<numSurveyItems],
df_attention_trialsPerUser$userId[!df_attention_trialsPerUser$trialNo==numAttentionTrials]))
# Manual checks showed:
## userID "5f4cc" is an odd user: no logs for item 1 and 2 -- strategy for now:
## keep that person anyway, with a note that for assessment of items 1 and 2, one person less was evaluated!
safe4now=c("5f4cc")
odd_userIDs=odd_userIDs[!odd_userIDs %in% safe4now]
# remove odd users from all dfs:
df_perf=df_perf[!df_perf$userId %in% odd_userIDs,]
df_rt=df_rt[!df_rt$userId %in% odd_userIDs,]
df_survey=df_survey[!df_survey$userId %in% odd_userIDs,]
df_attention=df_attention[!df_attention$userId %in% odd_userIDs,]
df_demo=df_demo[!df_demo$userId %in% odd_userIDs,]
# make factors (needed for grouping in later models/plots)
df_rt$userId <- as.factor(df_rt$userId)
df_rt$group <- as.factor(df_rt$group)
```
## General infos after removal of incomplete datasets
How many users do we have in our performance df before any cleaning (i.e., also including users with incomplete datasets)? `r length(userNo_raw$Names)`
After cleaning, we have `r length(unique(df_perf$userId))` participants. Of those,
* `r length(unique(df_perf$userId[df_perf$group=="C"]))` participants were in the control condition and
* `r length(unique(df_perf$userId[df_perf$group=="E"]))` participants in the explanation condition.
## Check covariates across groups
Additionally to assessing performance, we also acquire age and gender information of participants.
What do our groups look like? Are the groups comparable?
```{r echo=FALSE, warning=FALSE,fig.height = 4, fig.width = 3.5}
# get only age info
df_demo_age = df_demo[df_demo$item=='age',]
df_demo_gender = df_demo[df_demo$item=='gender',]
# summarize to get overview values of frequencies and percentages (age)
df_demo_age_summary=dplyr::summarise(group_by(df_demo_age, group, item, responseNo),
SumChecks=sum(checked),
PercUsersChecked=100*(sum(checked)/length(unique(userId))))
# summarize to get overview values of frequencies and percentages (gender)
df_demo_gender_summary=dplyr::summarise(group_by(df_demo_gender, group, item, responseNo),
SumChecks=sum(checked),
PercUsersChecked=100*(sum(checked)/length(unique(userId))))
# convert to factor for proper plotting:
df_demo_age_summary$responseNo=as.factor(df_demo_age_summary$responseNo)
df_demo_gender_summary$responseNo=as.factor(df_demo_gender_summary$responseNo)
# AGE: display frequency as raw counts
CovAge_FreqUserResponses = ggplot(data=df_demo_age_summary, aes(x=responseNo,fill = group)) +
geom_bar(aes(y = SumChecks),stat="identity",position = position_dodge(preserve = "single"))+
labs(title="Age of participants (freq. counts)",x="", y = "Frequency of answer")+
theme_bw(base_size = 10)+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_x_discrete(breaks=1:7, labels=c("18-24y","25-34y","35-44y","45-54y","55-64y","65 and\nover","Prefer not\nto answer"))+
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 60,hjust = 0.95))
# AGE: display frequency as percentage
CovAge_PercUserResponses = ggplot(data=df_demo_age_summary, aes(x=responseNo,fill = group)) +
geom_bar(aes(y = PercUsersChecked),stat="identity",position = position_dodge(preserve = "single"))+
labs(title="Age of participants (% of users)",x="", y = "% of users")+
theme_bw(base_size = 10)+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_x_discrete(breaks=1:7, labels=c("18-24y","25-34y","35-44y","45-54y","55-64y","65 and\nover","Prefer not\nto answer"))+
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 60,hjust = 0.95))
# put plots together
figure_CovAgeRaw <- ggarrange(CovAge_FreqUserResponses,CovAge_PercUserResponses,
ncol = 1, nrow = 2, align = "v")
# save
# BUGFIX: pass the combined figure explicitly; without `plot =`, ggsave()
# writes the last individual plot instead of the ggarrange composite
ggsave("Figures/CovAgeRaw_distribution_IAZ_EXP1_FINAL.pdf",plot = figure_CovAgeRaw,width = 9, height = 7)
# show
figure_CovAgeRaw
# GENDER: display frequency as raw counts
CovGender_FreqUserResponses = ggplot(data=df_demo_gender_summary, aes(x=responseNo,fill = group)) +
geom_bar(aes(y = SumChecks),stat="identity",position = position_dodge(preserve = "single"))+
labs(title="Gender of participants (freq. counts)",x="", y = "Frequency of answer")+
theme_bw(base_size = 10)+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_x_discrete(breaks=1:7, labels=c("Female","Male","Transgender\nfemale","Transgender\nmale","Non-binary/\n gender non-\nconforming","Not listed","Prefer not\nto answer"))+
ylim(0, 40)+
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 60,hjust = 0.95))
# GENDER: display frequency as percentage
CovGender_PercUserResponses = ggplot(data=df_demo_gender_summary, aes(x=responseNo,fill = group)) +
geom_bar(aes(y = PercUsersChecked),stat="identity",position = position_dodge(preserve = "single"))+
labs(title="Gender of participants (% of users)",x="", y = "% of users")+
theme_bw(base_size = 10)+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_x_discrete(breaks=1:7, labels=c("Female","Male","Transgender\nfemale","Transgender\nmale","Non-binary/\n gender non-\nconforming","Not listed","Prefer not\nto answer"))+
#ylim(0, 70)+
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 60,hjust = 0.95))
# put plots together
figure_CovGenderRaw <- ggarrange(CovGender_FreqUserResponses,CovGender_PercUserResponses,
ncol = 1, nrow = 2, align = "v")
# save
# BUGFIX: pass the combined figure explicitly (see above)
ggsave("Figures/CovGenderRaw_distribution_IAZ_EXP1_FINAL.pdf",plot = figure_CovGenderRaw,width = 9, height = 7)
# show
figure_CovGenderRaw
```
Let's run a statistical comparison between our two groups. For age, we have ordinal data (in age bands), so we will use a non-parametric statistical test for ordinal data, that's the Wilcoxon–Mann–Whitney U test.
For gender, we first need to check whether the data is normally distributed. If so, we use a t-test; if not, we again fall back on the non-parametric Wilcoxon–Mann–Whitney U test.
```{r echo=FALSE, warning=FALSE}
# statistical differences between groups?
# cumulative counts per age band, used to read off the median age group for reporting
medianAge_C=cumsum(df_demo_age_summary$SumChecks[df_demo_age_summary$group=="C"])
medianAge_E=cumsum(df_demo_age_summary$SumChecks[df_demo_age_summary$group=="E"])
# inspect: we have 50 people per group, in which group does the 25th person lie?
# count users who 'preferred not to answer' (responseNo 7)
N_noAnswer_age=nrow(df_demo_age[df_demo_age$responseNo==7 & df_demo_age$checked==1,])
N_noAnswer_gender=nrow(df_demo_gender[df_demo_gender$responseNo==7 & df_demo_gender$checked==1,])
# remove 'prefer not to answer' entries, keep only actually-checked responses
df_demo_age = df_demo_age[!df_demo_age$responseNo==7,]
df_demo_age_responses=df_demo_age[!df_demo_age$checked==0,]
df_demo_gender = df_demo_gender[!df_demo_gender$responseNo==7,]
df_demo_gender_responses=df_demo_gender[!df_demo_gender$checked==0,]
# Age first:
# check sample sizes, make sure they deviate not too much:
N_E_age=sum(df_demo_age_responses$group=='E')
N_C_age=sum(df_demo_age_responses$group=='C')
# age bands are ordinal, so use the non-parametric Wilcoxon test directly
aget="Wilcox"
agetest=uk_wilcox.test(df_demo_age_responses$responseNo[df_demo_age_responses$group=="E"],df_demo_age_responses$responseNo[df_demo_age_responses$group=="C"],paired=FALSE,exact=FALSE)
# effect size r = Z / sqrt(N)
ageeffsize=agetest$z_val/(sqrt(nrow(df_demo_age_responses)))
matchingRes=paste(matchingRes,paste("\n","AgeRaw",sep=""),"",aget,agetest$p.value,ageeffsize,sep = ",")
# Gender second:
# check sample sizes, make sure they deviate not too much:
N_E_gender=sum(df_demo_gender_responses$group=='E')
N_C_gender=sum(df_demo_gender_responses$group=='C')
# check for significant differences in gender distribution between groups:
# check if sample is normally distributed using shapiro test
shapiro=shapiro.test(df_demo_gender_responses$responseNo) # if p-value is lower than 0.05, you can conclude that the sample deviates from normality
if(shapiro$p.value > 0.05){
  # if it's normal: t-test
  gent="TTest"
  gentest=t.test(df_demo_gender_responses$responseNo ~ df_demo_gender_responses$group,alternative="two.sided")
  # BUGFIX: extract the numeric estimate; pasting the full cohen.d object into
  # the results string (and the inline report below) would print a garbled list
  geneffsize=cohen.d(df_demo_gender_responses$responseNo ~ df_demo_gender_responses$group)$estimate
} else {
  # otherwise: non-parametric Wilcoxon, effect size r = Z / sqrt(N)
  gent="Wilcox"
  gentest=uk_wilcox.test(df_demo_gender_responses$responseNo[df_demo_gender_responses$group=="E"],df_demo_gender_responses$responseNo[df_demo_gender_responses$group=="C"],paired=FALSE,exact=FALSE)
  geneffsize=gentest$z_val/(sqrt(nrow(df_demo_gender_responses)))
}
matchingRes=paste(matchingRes,paste("\n","GenderRaw",sep=""),shapiro$p.value,gent,gentest$p.value,geneffsize,sep = ",")
```
We acquired data from `r length(unique(df_demo$userId))` participants, with `r length(unique(df_demo$userId[df_demo$group=="C"]))` users in the control group (`r length(unique(df_demo$userId[df_demo$group=="C" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==1]))` female, `r length(unique(df_demo$userId[df_demo$group=="C" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==2]))` male, median age group is 35-44years), and `r length(unique(df_demo$userId[df_demo$group=="E"]))` users in the explanation group (`r length(unique(df_demo$userId[df_demo$group=="E" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==1]))` female, `r length(unique(df_demo$userId[df_demo$group=="E" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==2]))` male, median age group is 35-44years).
The analysis showed for *Age*:
* We have age information for `r N_E_age` users in the explanation and `r N_C_age` users in the control group.
* Is there a significant difference in terms of age between the groups? We compared ages of users in explanation condition and users in the control condition using a `r aget` test. This showed: U=`r agetest$statistic `, p=`r agetest$p.value `, r = `r ageeffsize `
The analysis showed for *Gender*:
* We have gender information for `r N_E_gender` users in the explanation and `r N_C_gender` users in the control group.
* Is there a significant difference in terms of gender between the groups? We compared gender distribution for users in explanation condition and users in the control condition using a `r gent` test. This showed
+ for wilcoxon test: U=`r gentest$statistic `, p=`r gentest$p.value `, r = `r geneffsize `
## Quality criteria
Before going into the hypotheses, we should apply some quality criteria to our data. Sub-quality data should be removed. The following subsections take care of such cases.
### Identify "speeders"
Speeders are people clicking through the study way too quickly to do the task properly.
Aim: identify IDs being faster than specified values (variable per game part).
This part will tag users that needed less than 2000ms to reach a feeding decision (suspiciously quick) in 4 or more trials.
```{r echo=FALSE, fig.align = "center", fig.height = 4, fig.width = 7}
# define min times for individual trial types (in ms)
min_timeAgreementScene= 1 # 1 second only? rationale: people agree really quickly, accept "speeding" here
min_timeStartScene=20000 # 20000=20s is the delay before button appears
min_timeStableUntilFeeding=2000 #2000ms = 2s
min_timeFeedbackScene=10000 # 10000=10s is current delay before button appears
# take a look: plot reaction times per participant, per trial type;
# users below the red threshold line get a repelled ID label
# AgreementScene
pRTagreement <- ggplot(unique(df_rt[,c('userId','timeAgreementScene','group')]), aes(x=timeAgreementScene, y=0,label=userId,colour = factor(group)))+
geom_point(alpha = 0.5)+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
geom_vline(xintercept = min_timeAgreementScene,linetype="dotted",
color = "red")+
geom_text(aes(min_timeAgreementScene,0,label = min_timeAgreementScene, hjust = -0.1,vjust = -1),color = "red")+
geom_label_repel(data = . %>% mutate(lab = ifelse(timeAgreementScene <
min_timeAgreementScene, as.character(userId), "")),
aes(label = lab),
box.padding = 1,
show.legend = FALSE,max.overlaps = Inf)+
labs(title="Time spent on agreement scene",x="Time (ms)", y = "")+
theme_bw()+
theme(axis.ticks.y=element_blank(),axis.text.y=element_blank())
# StartScene (i.e., instructions)
pRTstart <- ggplot(unique(df_rt[,c('userId','timeStartScene','group')]), aes(x=timeStartScene, y=0,label=userId,colour = factor(group)))+
geom_point(alpha = 0.5)+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
geom_vline(xintercept = min_timeStartScene,linetype="dotted",
color = "red")+
geom_text(aes(min_timeStartScene,0,label = min_timeStartScene, hjust = -0.1,vjust = -1),color = "red")+
geom_label_repel(data = . %>% mutate(lab = ifelse(timeStartScene <
min_timeStartScene, as.character(userId), "")),
aes(label = lab),
box.padding = 1,
show.legend = FALSE,max.overlaps = Inf)+
labs(title="Time spent on start (instruction) scene",x="Time (ms)", y = "")+
theme_bw()+
theme(axis.ticks.y=element_blank(),axis.text.y=element_blank())
# Time needed until feeding
pRTuntilFeeding <- ggplot(df_rt[,c('userId','timeStableUntilFeeding','group','TrialNr')], aes(x=as.factor(TrialNr), y=timeStableUntilFeeding,group=userId,label=userId,colour = factor(group)))+
geom_point(alpha = 0.5)+
geom_line()+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
geom_hline(yintercept = min_timeStableUntilFeeding,linetype="dotted",
color = "red")+
geom_text(aes(0.3,min_timeStableUntilFeeding,label = min_timeStableUntilFeeding, vjust = -1),color = "red")+
geom_label_repel(data = . %>% mutate(lab = ifelse(timeStableUntilFeeding < min_timeStableUntilFeeding,as.character(userId), "")),
aes(label = lab),
box.padding = 1,
show.legend = FALSE,max.overlaps = Inf) + #this removes the 'a' from the legend
labs(title="Time needed to reach feeding decision",x="Trial", y = "Time (ms)")+
theme_bw()
# time spent on feedback scenes
pRTfeedback <- ggplot(unique(df_rt[,c('userId','timeFeedbackScene','group','BlockNr')]), aes(x=as.factor(BlockNr), y=timeFeedbackScene,group=userId,label=userId,colour = factor(group)))+
geom_point(alpha = 0.5)+
geom_line()+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
geom_hline(yintercept = min_timeFeedbackScene,linetype="dotted",
color = "red")+
geom_text(aes(0.7,min_timeFeedbackScene,label = min_timeFeedbackScene, vjust = -1),color = "red")+
geom_label_repel(data = . %>% mutate(lab = ifelse(timeFeedbackScene < min_timeFeedbackScene,as.character(userId), "")),
aes(label = lab),
box.padding = 1,
show.legend = FALSE,max.overlaps = Inf) + #this removes the 'a' from the legend
labs(title="Time needed to study feedback",x="Block", y = "Time (ms)")+
theme_bw()
# put plots together, v1
figure_RTquality_Agree_Start <- ggarrange(pRTagreement,pRTstart,
ncol = 1, nrow = 2, align = "v")
# save
# BUGFIX: pass the combined figure explicitly; without `plot =`, ggsave()
# writes the last individual plot instead of the ggarrange composite
ggsave("Figures/DatQual_RTquality_Agree_Start_IAZ_EXP1_FINAL.pdf",plot = figure_RTquality_Agree_Start,width = 9, height = 7)
# put plots together, v2
figure_RTquality_Feeding_Feedback <- ggarrange(pRTuntilFeeding, pRTfeedback,
ncol = 1, nrow = 2, align = "v")
# save
# BUGFIX: pass the combined figure explicitly (see above)
ggsave("Figures/DatQual_RTquality_Feeding_Feedback_IAZ_EXP1_FINAL.pdf",plot = figure_RTquality_Feeding_Feedback,width = 9, height = 7)
# display figure
print("Display detailed RT data for different trials:")
figure_RTquality_Agree_Start
figure_RTquality_Feeding_Feedback
# get IDs of "speeders" per trialtype
speederAgreement_IDs=as.character(unique(df_rt$userId[df_rt$timeAgreementScene < min_timeAgreementScene]))
speederStart_IDs=as.character(unique(df_rt$userId[df_rt$timeStartScene < min_timeStartScene]))
speederFeeding_IDs=as.character(df_rt$userId[df_rt$timeStableUntilFeeding < min_timeStableUntilFeeding])
# each duplicated() pass drops the first occurrence of every ID, so three passes
# leave only IDs that appeared at least 4 times (= sped in 4 or more trials)
# duplicated once (i.e., happened in 2 trials at least):
speederFeeding_IDs=speederFeeding_IDs[duplicated(speederFeeding_IDs)]
# duplicated again (i.e., happened in 3 trials at least):
speederFeeding_IDs=speederFeeding_IDs[duplicated(speederFeeding_IDs)]
# duplicated again (i.e., happened in 4 trials at least):
speederFeeding_IDs=unique(speederFeeding_IDs[duplicated(speederFeeding_IDs)])
speederFeedback_IDs=as.character(unique(df_rt$userId[df_rt$timeFeedbackScene < min_timeFeedbackScene]))
# get all unique IDs of "speeders"
speeder_IDs=as.character(unique(c(speederAgreement_IDs,speederStart_IDs,speederFeeding_IDs,speederFeedback_IDs)))
# look at performance data of speeders:
# df_perf[df_perf$userId %in% speeder_IDs,]
```
### Identify participants failing the two attention checks
We include 2 attention checks during the game by asking participants to indicate current pack size after trials 3 and 7.
Aim: Identify IDs of users getting either one or both checks wrong; exclude those getting both wrong.
```{r echo=FALSE}
# identify correct and incorrect replies (user-reported pack size vs true pack size)
df_attention$correctReply = df_attention$userInput == df_attention$shubNo
# obtain IDs of participants that got at least 1 check wrong
attentionFailOne_IDs=as.character(unique(df_attention$userId[!df_attention$correctReply]))
# obtain IDs of participants that got both wrong (ID appears twice among incorrect replies)
attentionFailBoth_IDs=as.character(df_attention$userId[!df_attention$correctReply][duplicated(df_attention$userId[!df_attention$correctReply])])
# look at performance data of non-attentive users:
# df_perf[df_perf$userId %in% attentionFailBoth_IDs,]
# Second attention check: did users detect the "red herring" question (item 7); also consider removing those who did not!
# set to data table
dt_survey=setDT(df_survey,key=c("userId"))
attentionFailSurvey = dt_survey[ checked == 1 & itemNo == 7 & !responseNo==6 ]
# BUGFIX: deduplicate and coerce to character for consistency with the other *_IDs vectors
attentionFailSurvey_IDs=as.character(unique(attentionFailSurvey$userId))
# look at performance data of non-attentive users:
# df_perf[df_perf$userId %in% attentionFailSurvey_IDs,]
```
### Identify "straight-liners" in game part
Identify users who always give the same answer in the game part (over individual blocks, and over all blocks) DESPITE not increasing their pack size.
Aim: identify IDs of users "straight-lining" in at least two blocks, while pack size did not change (i.e., who were "immune to feedback").
```{r echo=FALSE}
# set to data table
dt_perf=setDT(df_perf,key=c("userId", "blockNo"))
# For each user and block, flag whether each input column varied at all.
# mismatch = TRUE is good, meaning there is variation in input data.
# (Single grouped aggregation replaces five separate scans joined by nested merges;
# keyby yields the same userId/blockNo-keyed result.)
straightlineGame_data=dt_perf[,list(mismatchP1=length(unique(plant1))>1,
                                    mismatchP2=length(unique(plant2))>1,
                                    mismatchP3=length(unique(plant3))>1,
                                    mismatchP4=length(unique(plant4))>1,
                                    mismatchP5=length(unique(plant5))>1,
                                    mismatchShubNoNew=length(unique(shubNoNew))>1),
                              keyby=.(userId,blockNo)]
# keep only rows without any mismatches (i.e., blocks without variation in user input)
straightlineGame_data=straightlineGame_data[(!straightlineGame_data$mismatchP1) & (!straightlineGame_data$mismatchP2) & (!straightlineGame_data$mismatchP3) & (!straightlineGame_data$mismatchP4) & (!straightlineGame_data$mismatchP5) & (!straightlineGame_data$mismatchShubNoNew), ]
# count occurences of userIDs
straightlinersGame_IDs=straightlineGame_data %>% count(userId)
# keep only IDs of users straight-lining in at least 3 blocks (i.e. half the blocks)
straightlinersGame_IDs=straightlinersGame_IDs[(straightlinersGame_IDs$n > 2), ]
straightlinersGame_IDs=as.character(straightlinersGame_IDs$userId)
# look at performance data of game-straightliners:
# df_perf[df_perf$userId %in% straightlinersGame_IDs,]
```
### Identify "straight-liners" in survey part
Identify users who always give very uniform answers in the survey part.
Aim: identify IDs of users "straight-lining", i.e. giving only responses with either positive or negative valence.
```{r echo=FALSE}
# convert survey df to a data table keyed by user
dt_survey=setDT(df_survey,key=c("userId"))
# restrict to the checked responses for items 3,4,5,6,8,9 (item 7 is the red herring)
straightlineSurvey_data=dt_survey[ checked == 1 & itemNo > 2 & !itemNo==7]
# classify each response's valence in one pass:
# responseNo 3 or 6 -> neutral, above 3 -> positive, otherwise -> negative
straightlineSurvey_data$valence=ifelse(
  straightlineSurvey_data$responseNo==3 | straightlineSurvey_data$responseNo==6,
  "neut",
  ifelse(straightlineSurvey_data$responseNo>3, "pos", "neg"))
# a user "straight-lines" if every one of their responses shares a single valence
# (mismatchValenceSurveyItems = TRUE means the user's answers varied in valence)
straightlinersSurvey_IDs=straightlineSurvey_data[,list(mismatchValenceSurveyItems=length(unique(valence))>1),keyby=.(userId)]
# keep the users WITHOUT a valence mismatch (uniform answers) and return their IDs
straightlinersSurvey_IDs=as.character(
  straightlinersSurvey_IDs$userId[!straightlinersSurvey_IDs$mismatchValenceSurveyItems])
# look at survey responses of survey-straightliners:
# df_survey[df_survey$userId %in% straightlinersSurvey_IDs & checked==1,]
```
### Remove data from problematic users
As we have identified users that seem to have dodgy data, we want to remove them.
```{r echo=FALSE}
# Helper: drop all rows whose userId appears in `ids` from a data frame.
# Standard-evaluation subsetting (instead of subset()'s non-standard
# evaluation) keeps the helper safe to call from any environment.
# Replaces 20 near-identical subset() calls with one mechanism.
remove_users <- function(df, ids) {
  df[!(df$userId %in% ids), , drop = FALSE]
}

# remove speeders from all dfs
df_perf      <- remove_users(df_perf,      speeder_IDs)
df_rt        <- remove_users(df_rt,        speeder_IDs)
df_survey    <- remove_users(df_survey,    speeder_IDs)
df_attention <- remove_users(df_attention, speeder_IDs)
df_demo      <- remove_users(df_demo,      speeder_IDs)

# remove attentionFailers (game checks), that were not already recognized as speeders
attentionFailBoth_IDs_clean <- attentionFailBoth_IDs[!attentionFailBoth_IDs %in% speeder_IDs]
df_perf      <- remove_users(df_perf,      attentionFailBoth_IDs_clean)
df_rt        <- remove_users(df_rt,        attentionFailBoth_IDs_clean)
df_survey    <- remove_users(df_survey,    attentionFailBoth_IDs_clean)
df_attention <- remove_users(df_attention, attentionFailBoth_IDs_clean)
df_demo      <- remove_users(df_demo,      attentionFailBoth_IDs_clean)

# remove attentionFailers (survey check), that were not already recognized as speeders or game AFs
attentionFailSurvey_IDs_clean <- attentionFailSurvey_IDs[!attentionFailSurvey_IDs %in% c(speeder_IDs, attentionFailBoth_IDs_clean)]
df_perf      <- remove_users(df_perf,      attentionFailSurvey_IDs_clean)
df_rt        <- remove_users(df_rt,        attentionFailSurvey_IDs_clean)
df_survey    <- remove_users(df_survey,    attentionFailSurvey_IDs_clean)
df_attention <- remove_users(df_attention, attentionFailSurvey_IDs_clean)
df_demo      <- remove_users(df_demo,      attentionFailSurvey_IDs_clean)

# remove game straightliners, that were not already recognized as speeders / attention failers
straightlinersGame_IDs_clean <- straightlinersGame_IDs[!straightlinersGame_IDs %in% c(speeder_IDs, attentionFailBoth_IDs_clean, attentionFailSurvey_IDs_clean)]
df_perf      <- remove_users(df_perf,      straightlinersGame_IDs_clean)
df_rt        <- remove_users(df_rt,        straightlinersGame_IDs_clean)
df_survey    <- remove_users(df_survey,    straightlinersGame_IDs_clean)
df_attention <- remove_users(df_attention, straightlinersGame_IDs_clean)
df_demo      <- remove_users(df_demo,      straightlinersGame_IDs_clean)

# remove survey straightliners, that were not already recognized as speeders / attention failers / game straightliners
straightlinersSurvey_IDs_clean <- straightlinersSurvey_IDs[!straightlinersSurvey_IDs %in% c(speeder_IDs, attentionFailBoth_IDs_clean, attentionFailSurvey_IDs_clean, straightlinersGame_IDs_clean)]
df_perf      <- remove_users(df_perf,      straightlinersSurvey_IDs_clean)
df_rt        <- remove_users(df_rt,        straightlinersSurvey_IDs_clean)
df_survey    <- remove_users(df_survey,    straightlinersSurvey_IDs_clean)
df_attention <- remove_users(df_attention, straightlinersSurvey_IDs_clean)
df_demo      <- remove_users(df_demo,      straightlinersSurvey_IDs_clean)

# repeat sanity check: Number of users in each df the same?
# make df with user numbers: user ID 'XXXX' appears in how many dfs?
# the value of 4 would be desirable: all IDs represented in all 4 dfs
# (df_demo is cleaned too but deliberately not part of this count)
userNo_clean <- data.frame(table(c(as.character(unique(df_perf$userId)),
                                   as.character(unique(df_rt$userId)),
                                   as.character(unique(df_survey$userId)),
                                   as.character(unique(df_attention$userId)))))
names(userNo_clean) <- c("Names", "Matches")
```
So to summarize:
* we have `r length(userNo_raw$Names)` users to begin with
* we remove `r sum(!userNo_raw$Matches==5)` users that have incomplete datasets (aborted prematurely)
* we remove `r length(odd_userIDs)` whose information was not logged properly
** Note here: for one user, logging failed for the survey items 1 and 2 - we will keep data for this person anyway for all other measurements
* we remove `r length(speeder_IDs)` speeders
* we remove `r length(attentionFailBoth_IDs_clean)` users that failed both attention tests during the game
* we remove `r length(attentionFailSurvey_IDs_clean)` users that failed the attention test in the survey
* we remove `r length(straightlinersGame_IDs_clean)` users that straightlined in the game, despite not improving
* we remove `r length(straightlinersSurvey_IDs_clean)` users that straightlined in the survey
Finally: How many users do we have in our clean performance df? `r length(unique(df_perf$userId))`
Do we have an equal number of users in each clean dataframe? `r length(unique(userNo_clean$Matches))==1`
```{r echo=FALSE}
# get age distribution after data cleaning
# summarize per group and age bracket: number of checks and the percentage
# of users who checked that bracket
df_demo_age_summary=dplyr::summarise(group_by(df_demo[df_demo$item=="age",], group, item, responseNo),
SumChecks=sum(checked),
PercUsersChecked=100*(sum(checked)/length(unique(userId))))
# cumulative counts per group, used to locate the median age bracket
# NOTE(review): despite the names these are cumulative sums, not medians;
# the median bracket is read off from where the cumsum crosses N/2
medianAge_C=cumsum(df_demo_age_summary$SumChecks[df_demo_age_summary$group=="C"])
medianAge_E=cumsum(df_demo_age_summary$SumChecks[df_demo_age_summary$group=="E"])
```
### Final, clean dataset
To sum up, in our final data we have `r length(unique(df_perf$userId))` users, with `r length(unique(df_perf$userId[df_perf$group=="C"]))` users in the control group (`r length(unique(df_demo$userId[df_demo$group=="C" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==1]))` female, `r length(unique(df_demo$userId[df_demo$group=="C" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==2]))` male, median age group is 35-44years), and
`r length(unique(df_perf$userId[df_perf$group=="E"]))` users in the explanation group (`r length(unique(df_demo$userId[df_demo$group=="E" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==1]))` female, `r length(unique(df_demo$userId[df_demo$group=="E" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==2]))` male, median age group is 35-44years).
Re-check: are there still no significant differences in terms of gender / age?
```{r echo=FALSE, warning=FALSE}
# statistical differences between groups after cleaning (age / gender)?
df_demo_age=df_demo[df_demo$item=="age",]
df_demo_gender=df_demo[df_demo$item=="gender",]
# count users who 'prefered not to answer' (responseNo 7 per the survey coding)
N_noAnswer_age=nrow(df_demo_age[df_demo_age$responseNo==7 & df_demo_age$checked==1,])
N_noAnswer_gender=nrow(df_demo_gender[df_demo_gender$responseNo==7 & df_demo_gender$checked==1,])
# remove 'prefer not to answer' entries, then keep only checked responses
df_demo_age = df_demo_age[!df_demo_age$responseNo==7,]
df_demo_age_responses=df_demo_age[!df_demo_age$checked==0,]
df_demo_gender = df_demo_gender[!df_demo_gender$responseNo==7,]
df_demo_gender_responses=df_demo_gender[!df_demo_gender$checked==0,]
# Age first:
# check sample sizes, make sure they deviate not too much:
N_E_age=sum(df_demo_age_responses$group=='E')
N_C_age=sum(df_demo_age_responses$group=='C')
aget="Wilcox"
# uk_wilcox.test is a project helper -- presumably a wrapper around
# wilcox.test() that also returns z_val for the effect-size computation
# below; TODO confirm against its definition earlier in the file
agetest=uk_wilcox.test(df_demo_age_responses$responseNo[df_demo_age_responses$group=="E"],df_demo_age_responses$responseNo[df_demo_age_responses$group=="C"],paired=FALSE,exact=FALSE)
# effect size r = z / sqrt(N)
ageeffsize=agetest$z_val/(sqrt(nrow(df_demo_age_responses)))
# append results to the running CSV-style results string
matchingRes=paste(matchingRes,paste("\n","AgeClean",sep=""),"",aget,agetest$p.value,ageeffsize,sep = ",")
# Gender second:
# check sample sizes, make sure they deviate not too much:
N_E_gender=sum(df_demo_gender_responses$group=='E')
N_C_gender=sum(df_demo_gender_responses$group=='C')
# check for significant differences in gender distribution between groups
# NOTE(review): gender is categorical; a chi-square test would be more
# conventional than t/Wilcoxon on the numeric coding -- flagged, not changed
# check if sample is normally distributed using shapiro test
shapiro=shapiro.test(df_demo_gender_responses$responseNo) # if p-value is lower than 0.05, you can conclude that the sample deviates from normality
if(shapiro$p.value > 0.05){
# if it's normal: t-test
gent="TTest"
gentest=t.test(df_demo_gender_responses$responseNo ~ df_demo_gender_responses$group,alternative="two.sided")
geneffsize=cohen.d(df_demo_gender_responses$responseNo ~ df_demo_gender_responses$group)
} else {
# otherwise: nonparametric Wilcoxon rank-sum test via the project helper
gent="Wilcox"
gentest=uk_wilcox.test(df_demo_gender_responses$responseNo[df_demo_gender_responses$group=="E"],df_demo_gender_responses$responseNo[df_demo_gender_responses$group=="C"],paired=FALSE,exact=FALSE)
geneffsize=gentest$z_val/(sqrt(nrow(df_demo_gender_responses)))
}
# append gender results to the running results string
matchingRes=paste(matchingRes,paste("\n","GenderClean",sep=""),shapiro$p.value,gent,gentest$p.value,geneffsize,sep = ",")
```
The analysis showed for *Age* in the clean dataset:
* We have age information for `r N_E_age` users in the explanation and `r N_C_age` users in the control group.
* Is there a significant difference in terms of age between the groups? We compared age of users in the explanation condition and users in the control condition using a `r aget` test. This showed: U=`r agetest$statistic `, p=`r agetest$p.value `, r = `r ageeffsize `
The analysis showed for *Gender* in the clean dataset:
* We have gender information for `r N_E_gender` users in the explanation and `r N_C_gender` users in the control group.
* Is there a significant difference in terms of gender between the groups? We compared gender distribution of users in explanation condition and users in the control condition using a `r gent` test. This showed
+ for wilcoxon test: U=`r gentest$statistic `, p=`r gentest$p.value `, r = `r geneffsize `
## Hypotheses
The main hypothesis is the following:
*H1) CFEs will help users tasked to discover unknown relationships in data. We expect this to affect objective as well as subjective understandability.*
That means, we expect users in the explanation condition to
* H1.1) perform better over time in terms of number of Shubs generated, *AND*
* H1.2) will become quicker in the final blocks, because choosing the right plants will become more automatic, *AND*
* H1.3) can more clearly state which plants were crucial for the Shubs to prosper (survey items 1 and 2).
Further, we expect:
*H2) Users will differ in terms of their subjective understanding*, specifically:
* H2.1) Users will differ in how far they found the feedback (CF-style explanation vs. overview over past choices only) useful, and in how far they could make use of it, with an advantage of providing CFEs (survey items 5, 6)
* H2.2) Users expect that providing CFEs will be more helpful for other users, too (survey item 9).
Moreover:
*H3) We expect users in different conditions not to differ in terms of how well they understood the feedback per se, or needing support for understanding. This would be good, because it means that the added information provided does not overload the participant's cognitive capacities. (survey items 3, 4).* So this is also control to make sure groups don't differ in a weird way.
Last:
*H4) We expect timing and efficacy of how feedback was provided to be comparable (survey item 10) - a further control.* Could still be that there is a difference here, as less useful feedback (control) is perceived less efficient.
*H5) Finally, we predict that users will not have uncovered inconsistencies in the feedback.* It would be weird for the control group; and the models were really good, and we trust CEML to do a good job. (survey item 8)
# Statistical assessment
[...] Comparisons of performance over time between users in the explanation and control conditions, respectively, are performed using R 4.1.1 [@r_core_team_r_2021]. Changes in performance over 12 trials as a measure of learning rate per group are modeled using the lme4 package v1.1-27.1.
In the model testing for differences in terms of user performance, the dependent variable is number of Shubs generated. In the assessment of user's reaction time, we used time needed to reach a feeding decision in each trial as dependent variable.
The final models include the fixed effects of group, trial number and their interaction. The random-effect structure includes a by-subjects random intercept.
Advantages of using this approach include that these models account for correlations of data drawn from the same participant [@detry_analyzing_2016].
<!--The code is inspired by this 2-part tutorial: https://www.youtube.com/watch?v=AWInLxpiZuA; https://www.youtube.com/watch?v=YsD8b5KYdMw -->
Model fits are compared with the analysis of variance function of the stats package.
Effect sizes are computed in terms of $\eta_{\text{p}}^{2}$ using the effectsize package v.0.5.
Significant main effects or interactions are followed up by computing the pairwise estimated marginal means. All post-hoc analyses reported are Bonferroni corrected to account for multiple comparisons.
## H1: Providing CFEs helps users
Recap the full hypothesis:
*H1) CFEs will help users tasked to discover unknown relationships in data. We expect this to affect objective as well as subjective understandability.*
That means, we expect users in the explanation condition to
* H1.1) perform better over time in terms of number of Shubs generated, *AND*
* H1.2) will become quicker in the final blocks, because choosing the right plants will become more automatic, *AND*
* H1.3) can more clearly state which plants were crucial for the Shubs to prosper (survey items 1 and 2).
### H1.1) Users in the explanation condition perform better over time in terms of number of Shubs generated.
Let's start with a first peek at the data: Descriptive stats + plotting the pack size trajectories per trial and block for each person individually (figure not shown in .pdf).
```{r echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"}
# Descriptive stats for H1.1 (pack size over time, per group)
# make group a factor
df_perf$group=as.factor(df_perf$group)
# First peek at the data, getting min / max / median:
print("First peek at the data, getting min / max / median:")
print(tapply(df_perf$shubNoNew, df_perf$group, summary))
# CHECK: What can we see here? Do groups differ wrt the range? Does one have smaller minimal values / larger maximal scores?
#Next is visual assessment: Plot scores per participant per trial and also averages over blocks (aka spaghetti plot):
# spaghetti plot: one line per participant, colored by group
H1.1_p_ShubsPerTrial <- ggplot(df_perf, aes(x=factor(trialNo), y=shubNoNew, group = userId, color= group))+
geom_point(alpha = 0.5)+
geom_line()+
#facet_wrap(vars(group),nrow = 2, ncol = 1)+
labs(title="Development of pack size by group over trials",x="Trial", y = "Pack size")+
theme_bw(base_size = 10)+
#scale_y_continuous(limits = c(0, 100))+
scale_x_discrete(breaks=1:numTrials)+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
theme(legend.position="bottom")
# prepare line plot to show sd and sem
# Summarise `varname` of `data` within each combination of `groupnames`.
#
# @param data       data frame to summarise
# @param varname    name (string) of the numeric column to summarise
# @param groupnames character vector of grouping column names
# @return data frame with one row per group combination and, besides the
#   grouping columns: mean (NA-removed mean), SEM and sem.
#   NOTE(review): the `SEM` column is actually the standard *deviation*;
#   it is kept under its original (misleading) name for backward
#   compatibility -- `sem` is the true standard error of the mean.
data_summary <- function(data, varname, groupnames){
  # NOTE: ddply() comes from plyr (assumed attached by the notebook setup),
  # not dplyr -- the original loaded dplyr here, which ddply does not use.
  summary_func <- function(x, col){
    s <- sd(x[[col]], na.rm=TRUE)  # computed once, reused for SEM and sem
    c(mean = s_mean <- mean(x[[col]], na.rm=TRUE),
      SEM = s,
      sem = s/sqrt(length(x[[col]])))
  }
  data_sum <- ddply(data, groupnames, .fun=summary_func, varname)
  return(data_sum)
}
# group-level mean / sd / sem of pack size per trial
df_ShubsPerTrial_summary=data_summary(df_perf, varname="shubNoNew",groupnames=c("group","trialNo"))
# plot group-mean data per trial, with a sem ribbon
H1.1_p_ShubsPerTrial_summary <- ggplot(df_ShubsPerTrial_summary, aes(x=factor(trialNo), y=mean, group = group, color= group))+
geom_point(alpha = 0.5)+
geom_line()+
geom_ribbon(aes(ymin=mean-sem, ymax=mean+sem,fill=group), linetype=2, alpha=0.1)+
#facet_wrap(vars(group),nrow = 2, ncol = 1)+
labs(title="Mean pack size by group over trials",x="Trial", y = " Mean pack size")+
theme_bw(base_size = 10)+
#scale_y_continuous(limits = c(0, 100))+
scale_x_discrete(breaks=1:numTrials)+
scale_y_continuous(limits = c(0, 75))+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
theme(legend.position="bottom")
# plot averaged data per block
# note: aggregate with a vector-valued FUN yields a matrix column, hence
# the shubNoNew[,"mean"] indexing in the plot below
df_perf_blockStats<-aggregate(shubNoNew ~ blockNo * userId + group, data=df_perf, FUN = function(x) c(mean = mean(x), SEM = sd(x), sem = sd(x)/sqrt(length(x))))
df_ShubsPerBlock_summary=data_summary(df_perf, varname="shubNoNew",groupnames=c("group","blockNo"))
# spaghetti plot: per-user block means with sem ribbons
H1.1_p_ShubsPerBlock <- ggplot(df_perf_blockStats, aes(x=blockNo, y=shubNoNew[,"mean"], group = userId, color= group))+
geom_point(alpha = 0.5)+
geom_line()+
geom_ribbon(aes(ymin=shubNoNew[,"mean"]-shubNoNew[,"sem"], ymax=shubNoNew[,"mean"]+shubNoNew[,"sem"],fill=group), linetype=2, alpha=0.1)+
#facet_wrap(vars(group),nrow = 2, ncol = 1)+
labs(title="Development of pack size by group over blocks",x="Block", y = "pack size")+
theme_bw(base_size = 10)+
#scale_y_continuous(limits = c(0, 100))+
scale_x_continuous(breaks=1:max(df_perf$blockNo))+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
theme(legend.position="bottom")
# plot group-mean data per block, with a sem ribbon
H1.1_p_ShubsPerBlock_summary <- ggplot(df_ShubsPerBlock_summary, aes(x=blockNo, y=mean, group = group, color= group))+
geom_point(alpha = 0.5)+
geom_line()+
geom_ribbon(aes(ymin=mean-sem, ymax=mean+sem,fill=group), linetype=2, alpha=0.1)+
#facet_wrap(vars(group),nrow = 2, ncol = 1)+
labs(title="Mean pack size by group over blocks",x="Block", y = "Mean pack size")+
theme_bw(base_size = 10)+
#scale_y_continuous(limits = c(0, 100))+
scale_x_continuous(breaks=1:max(df_ShubsPerBlock_summary$blockNo))+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
theme(legend.position="bottom")
# same plots in separate facets per group, for better visibility
H1.1_p_ShubsPerTrial_facet <- H1.1_p_ShubsPerTrial + facet_wrap(vars(group),nrow = 2, ncol = 1) + theme_bw(base_size = 10)
H1.1_p_ShubsPerBlock_facet <- H1.1_p_ShubsPerBlock + facet_wrap(vars(group),nrow = 2, ncol = 1) + theme_bw(base_size = 10)
# put all plots together into one two-row figure
H1.1_figure1_ShubData <- ggarrange(H1.1_p_ShubsPerTrial, H1.1_p_ShubsPerBlock,
ncol = 1, nrow = 2, heights=c(4,4), common.legend = TRUE)
# save -- the figure is passed explicitly: the original call omitted the
# `plot` argument, so ggsave() fell back to last_plot(), which is not the
# ggarrange object just built and could silently save the wrong figure
ggsave("Figures/H1.1_figure1_ShubData_IAZ_EXP1_FINAL.pdf", plot = H1.1_figure1_ShubData, width = 5, height = 4)
# same arrangement for the group-mean (summary) plots
H1.1_figure1_ShubData_summary <- ggarrange(H1.1_p_ShubsPerTrial_summary, H1.1_p_ShubsPerBlock_summary,
ncol = 1, nrow = 2, heights=c(4,4), common.legend = TRUE)
# save, again passing the intended figure explicitly
ggsave("Figures/H1.1_figure1_ShubData_summary_IAZ_EXP1_FINAL.pdf", plot = H1.1_figure1_ShubData_summary, width = 5, height = 4)
# # show
# print("Display figures showing development of pack size over trials / blocks:")
# #H1.1_figure1_ShubData
# H1.1_figure1_ShubData_summary
# last, make trialNo a factor so the upcoming LME is estimated as a
# 2x12 ANOVA-style model (one level per trial) rather than a linear trend
df_perf$trialNo = as.factor(df_perf$trialNo)
#summary(df_perf)
```
Now on to the statistics.
```{r echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"}
# Setting up our LME model (as a 2x12 Anova, group by trial)
# We use a mixed design, with one within-subjects IV (trial) and one between subjects IV (group),
# investigating the effect of both on Shubs generated.
# Note that we add a random intercept for the participant by stating + (1|userId)
# This makes it repeated measures, as we control for the random effect of
# one person doing something multiple times.
ShubNo_effect= lmer(shubNoNew ~ trialNo*group + (1|userId), data = df_perf) # linear model DV ShubNoNew predicted by the IV (trials, i.e. time)
ShubNo_effect_anova=anova(ShubNo_effect) # show model as anova
names(ShubNo_effect_anova)=c("SumSq","MeanSq","NumDF","DenDF","Fvalue","Pvalue") # rename fields for easy access
print("ANOVA table:")
print(ShubNo_effect_anova)
ShubNo_effect_effsize=effectsize::eta_squared(ShubNo_effect,partial = TRUE) # effect size eta squared
ShubNo_effect_r2=r.squaredGLMM(ShubNo_effect) # R^2, gives 2 values: R2m = effect of IV; R2c = effect of IV and random effect
# REMEMBER: WHEN THERE IS A SIGNIFICANT INTERACTION, DO NOT INTERPRET THE MAIN EFFECT ANYMORE (report it, though).
# POST-HOC:
# MAIN EFFECTS:
# so what conditions are significant, given main effects? NOT NECESSARY IN CASE OF SIGN. INTERACTION!
ShubNo_effect_posthoc_ME_time=emmeans(ShubNo_effect, list(pairwise ~ trialNo), adjust = "bonferroni") # pairwise comparisons between different trials
# estimated marginal means - first table: shows how many Shubs were produced on average per trial; based on that we can make statements of "more" / "fewer" Shubs produced
ShubNo_effect_posthoc_ME_group=emmeans(ShubNo_effect, list(pairwise ~ group), adjust = "bonferroni") # pairwise comparisons between different groups
# estimated marginal means - first table: shows how many Shubs were produced on average per group; based on that we can make statements of "more" / "fewer" Shubs produced
# get raw (not model-estimated) mean and sem per group for reporting:
ShubNo_effect_posthoc_ME_group_C_mean=mean(df_perf[df_perf$group=="C",]$shubNoNew)
ShubNo_effect_posthoc_ME_group_C_sem=sd(df_perf[df_perf$group=="C",]$shubNoNew)/sqrt(length(df_perf[df_perf$group=="C",]$shubNoNew))
ShubNo_effect_posthoc_ME_group_E_mean=mean(df_perf[df_perf$group=="E",]$shubNoNew)
ShubNo_effect_posthoc_ME_group_E_sem=sd(df_perf[df_perf$group=="E",]$shubNoNew)/sqrt(length(df_perf[df_perf$group=="E",]$shubNoNew))
# INTERACTION: group contrasts within each trial (Bonferroni-corrected)
ShubNo_effect_posthoc_INT_timeXgroup=emmeans(ShubNo_effect, list(pairwise ~ group | trialNo), adjust = "bonferroni")
ShubNo_effect_posthoc_INT_timeXgroup_effsizes=eff_size(ShubNo_effect_posthoc_INT_timeXgroup$`emmeans of group | trialNo`, sigma = sigma(ShubNo_effect), edf = Inf)
```
#### Results
The analysis revealed:
* a significant interaction (group x trials): F(`r ShubNo_effect_anova$NumDF[3]`,`r ShubNo_effect_anova$DenDF[3]`)=`r ShubNo_effect_anova$Fvalue[3] `, p=`r ShubNo_effect_anova$Pvalue[3]`,$\eta_{\text{p}}^{2}$=`r ShubNo_effect_effsize$Eta2_partial[3]`
Additionally:
* there was a significant main effect of trialnumber (time): F(`r ShubNo_effect_anova$NumDF[1]`,`r ShubNo_effect_anova$DenDF[1]`)=`r ShubNo_effect_anova$Fvalue[1] `, p=`r ShubNo_effect_anova$Pvalue[1]`,$\eta_{\text{p}}^{2}$=`r ShubNo_effect_effsize$Eta2_partial[1]`
* however, there was a no main effect of group: F(`r ShubNo_effect_anova$NumDF[2]`,`r ShubNo_effect_anova$DenDF[2]`)=`r ShubNo_effect_anova$Fvalue[2] `, p=`r ShubNo_effect_anova$Pvalue[2]`,$\eta_{\text{p}}^{2}$=`r ShubNo_effect_effsize$Eta2_partial[2]` (mean ShubNo explanation group: `r ShubNo_effect_posthoc_ME_group_E_mean`, sem=`r ShubNo_effect_posthoc_ME_group_E_sem`; mean ShubNo control group: `r ShubNo_effect_posthoc_ME_group_C_mean`, sem=`r ShubNo_effect_posthoc_ME_group_C_sem`).
Posthoc analysis revealed significant differences between groups from trial 9 onwards
(trial 9: t(56.7)=2.461, p=0.0169, Cohen's d=-1.711;
trial 10: t(56.7)=2.609, p=0.0116, Cohen's d=-1.814;
trial 11: t(56.7)=3.522, p=0.0009, Cohen's d=-2.449;
trial 12: t(56.7)=3.861, p=0.0003, Cohen's d=-2.685):
```{r echo=FALSE, fig.height = 2, fig.width = 7, fig.align = "center"}
# add statistics info to plot:
H1.1_p_ShubsPerTrial_summary_anno1 = H1.1_p_ShubsPerTrial_summary+
# vertical
geom_segment(aes(x = 9, y = 2.5, xend = 9, yend = 9),colour = "black", size=0.2) +
geom_text(x = 9, y = 20, label = "*", colour = "black")+
geom_segment(aes(x = 10, y = 2.5, xend = 10, yend = 9.5),colour = "black", size=0.2) +
geom_text(x = 10, y = 20, label = "*", colour = "black")+
geom_segment(aes(x = 11, y = 2.5, xend = 11, yend = 12),colour = "black", size=0.2) +
geom_text(x = 11, y = 22, label = "***", colour = "black")+
geom_segment(aes(x = 12, y = 2.5, xend = 12, yend = 13),colour = "black", size=0.2) +