-
Notifications
You must be signed in to change notification settings - Fork 3
/
uk_introAlienZoo_analysis_EXP1.Rmd
2555 lines (1940 loc) · 139 KB
/
uk_introAlienZoo_analysis_EXP1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Introducing the Alien Zoo: Summary of evaluation and results for Exp1 (DS4 condition, March 2022)"
output:
pdf_document:
toc: TRUE
toc_depth: 5
bibliography: IntroAlienZoo.bib
---
```{r setup, include=FALSE}
# Global knitr defaults: echo code in the rendered document.
# (Extended with warning=FALSE in the next chunk.)
knitr::opts_chunk$set(echo = TRUE)
```
```{r results='asis', echo=FALSE, include=FALSE}
# Re-set chunk defaults, additionally suppressing warnings in the output
knitr::opts_chunk$set(echo = TRUE, warning=FALSE)
# Plotting and data-wrangling libraries
library(rstudioapi)
library(ggplot2)
library(ggrepel)
library(plyr)
library(dplyr)
# Suppress summarise info
options(dplyr.summarise.inform = FALSE)
library(unikn)
library(ggpubr)
library(data.table)
library(tidyverse)
library(scales)
library(effsize)
# for the lme approach:
library("emmeans")
library("sjstats")
library("lme4")
library("lmerTest")
library("MuMIn")
# turn off scientific notation for exact values
options(scipen = 999)
# Barrier-free color palette
# Source: Okabe & Ito (2008): Color Universal Design (CUD):
# Fig. 16 of <https://jfly.uni-koeln.de/color/>:
# (a) Vector of colors (as RGB values):
Okabe_Ito_palette <- c(rgb( 0, 0, 0, maxColorValue = 255), # black
rgb(230, 159, 0, maxColorValue = 255), # orange
rgb( 86, 180, 233, maxColorValue = 255), # skyblue
rgb( 0, 158, 115, maxColorValue = 255), # green
rgb(240, 228, 66, maxColorValue = 255), # yellow
rgb( 0, 114, 178, maxColorValue = 255), # blue
rgb(213, 94, 0, maxColorValue = 255), # vermillion
rgb(204, 121, 167, maxColorValue = 255) # purple
)
# (b) Vector of color names:
o_i_names <- c("black", "orange", "skyblue", "green", "yellow", "blue", "vermillion", "purple")
# (c) Use newpal() to combine colors and names:
pal_okabe_ito <- newpal(col = Okabe_Ito_palette,
names = o_i_names)
# Group colors used throughout: Control = skyblue, CFE/explanation = green
Ccol=Okabe_Ito_palette[3]
Pcol=Okabe_Ito_palette[4]
# palette for likert scale data, inspired by yellow and vermillion values from Okabe_Ito
likert_Okabe_Ito_palette <- c(rgb(213, 94, 0, maxColorValue = 255), # vermillion (strongly disagree)
rgb(234, 175, 128, maxColorValue = 255), # middle yellow red (disagree)
rgb(255, 255, 255, maxColorValue = 255), # white (neutral)
rgb(249, 245, 179, maxColorValue = 255), # medium champagne (agree)
rgb(240, 228, 66, maxColorValue = 255) # yellow (strongly agree)
)
# CSV header for the string accumulating all statistical results
# (a redundant empty initialisation was removed; it was overwritten immediately)
matchingRes="Comparison,ShapiroPval,TestUsed,TestPval,TestEffSize"
# source adapted wilcoxon test (easy computation of effect size)
source("uk_wilcox.test.R")
```
\newpage
# Introduction
This is an analysis of data acquired in the "Introducing the Alien Zoo" study run on Amazon Mechanical Turk in March 2022. In this study, naive users were asked to interact with the Alien Zoo paradigm to understand relationships in an unknown dataset, which has been termed “learning to discover” by [@adadi_peeking_2018]. At regular intervals, participants receive either counterfactual explanations (CFEs) regarding past choices, or no explanation. Computed CFEs correspond to "Control" counterfactuals that fulfill the "smallest feature change" condition [@wachter_counterfactual_2017].
This script evaluates data from Experiment 1 (three features, plant 2, 4, and 5, impacting growth rate).
# First things first: rough data cleaning
Let's first just look at the data we have. Excluding all users that had incomplete datasets, what is the turnout?
```{r echo=FALSE, warning=FALSE}
# Set working directory to source file location
# (here::set_here() creates/returns the path of a .here sentinel file)
sourceLoc=here::set_here()
setwd(dirname(sourceLoc))
# Locate the five raw logging streams (one CSV each) in ../UserData
demo_source = list.files(path = "../UserData", pattern="demographics_IAZ_EXP1.csv",full.names=TRUE)
perf_source = list.files(path = "../UserData", pattern="performance_IAZ_EXP1.csv",full.names=TRUE)
rt_source = list.files(path = "../UserData", pattern="reactionTime_IAZ_EXP1.csv",full.names=TRUE)
survey_source = list.files(path = "../UserData", pattern="survey_IAZ_EXP1.csv",full.names=TRUE)
attention_source = list.files(path = "../UserData", pattern="attentionCheck_IAZ_EXP1.csv",full.names=TRUE)
# load to dframes
df_demo=read.csv(demo_source,header=TRUE)
df_perf=read.csv(perf_source,header=TRUE)
df_rt=read.csv(rt_source,header=TRUE)
df_survey=read.csv(survey_source,header=TRUE)
df_attention=read.csv(attention_source,header=TRUE)
# remove duplicated lines (double-logging happens occasionally)
df_demo=df_demo[!duplicated(df_demo),]
df_perf=df_perf[!duplicated(df_perf), ]
df_rt=df_rt[!duplicated(df_rt), ]
df_survey=df_survey[!duplicated(df_survey), ]
df_attention=df_attention[!duplicated(df_attention), ]
# truncate user IDs after 5 characters for better visualization
# NOTE(review): assumes the first 5 characters are still unique per user — verify against raw IDs
df_demo$userId=substr(df_demo$userId,1,5)
df_perf$userId=substr(df_perf$userId,1,5)
df_rt$userId=substr(df_rt$userId,1,5)
df_survey$userId=substr(df_survey$userId,1,5)
df_attention$userId=substr(df_attention$userId,1,5)
# group as factor, recode values to more descriptive letters:
# 1 -> "C" (control), 0 -> "E" (explanation/CFE)
df_demo$group=as.factor(df_demo$group)
df_demo$group=recode_factor(df_demo$group,"1"="C","0"="E")
df_perf$group=as.factor(df_perf$group)
df_perf$group=recode_factor(df_perf$group,"1"="C","0"="E")
df_rt$group=as.factor(df_rt$group)
df_rt$group=recode_factor(df_rt$group,"1"="C","0"="E")
df_survey$group=as.factor(df_survey$group)
df_survey$group=recode_factor(df_survey$group,"1"="C","0"="E")
df_attention$group=as.factor(df_attention$group)
df_attention$group=recode_factor(df_attention$group,"1"="C","0"="E")
# drop fields 'X' if present (was problem with simulated data)
# note: presence is only checked in df_perf; the other dfs are assumed to match
if("X" %in% colnames(df_perf)){
df_demo=subset(df_demo,select=-c(X))
df_perf=subset(df_perf,select=-c(X))
df_rt=subset(df_rt,select=-c(X))
df_survey=subset(df_survey,select=-c(X))
df_attention=subset(df_attention,select=-c(X))
}
# correct item number in survey df if zero based
if(0 %in% df_survey$itemNo){
df_survey$itemNo=df_survey$itemNo+1
}
# sanity check: Number of users in each df the same?
# make df with user numbers: user ID 'XXXX' appears in how many dfs?
# the value of 5 would be desirable: all IDs represented in all 5 dfs
userNo_raw <- data.frame(table(c(unique(df_perf$userId), unique(df_rt$userId), unique(df_survey$userId), unique(df_attention$userId),c(unique(df_demo$userId)))))
names(userNo_raw) <- c("Names", "Matches")
# Now, remove participants who did not finish all parts (i.e., not being part of all 5 dfs)
df_perf=df_perf[!df_perf$userId %in% userNo_raw$Names[!userNo_raw$Matches==5],]
df_rt=df_rt[!df_rt$userId %in% userNo_raw$Names[!userNo_raw$Matches==5],]
df_survey=df_survey[!df_survey$userId %in% userNo_raw$Names[!userNo_raw$Matches==5],]
df_attention=df_attention[!df_attention$userId %in% userNo_raw$Names[!userNo_raw$Matches==5],]
# sort according to user ID and trial number:
df_perf=df_perf[order(df_perf$userId, df_perf$trialNo),]
df_rt=df_rt[order(df_rt$userId, df_rt$TrialNr),]
df_survey=df_survey[order(df_survey$userId, df_survey$itemNo),]
df_attention=df_attention[order(df_attention$userId, df_attention$trialNo),]
# design specs to make some things easier:
# trial/item counts are inferred from the data; attention checks are fixed at 2 by design
numTrials=max(df_perf$trialNo)
numAttentionTrials=2
numSurveyItems=max(df_survey$itemNo)
# number of trials per userId (to spot users with incomplete logs):
df_perf_trialsPerUser=aggregate(trialNo ~ userId, data = df_perf, FUN = function(x){NROW(x)})
df_rt_trialsPerUser=aggregate(TrialNr ~ userId, data = df_rt, FUN = function(x){NROW(x)})
df_survey_checkedItemsPerUser=aggregate(checked ~ userId, data = df_survey, FUN = function(x){sum(x==1)})
df_attention_trialsPerUser=aggregate(trialNo ~ userId, data = df_attention, FUN = function(x){NROW(x)})
# collect "odd" userIDs: anyone with too few trials, survey items, or attention checks
odd_userIDs=unique(c(df_perf_trialsPerUser$userId[!df_perf_trialsPerUser$trialNo==numTrials],
df_rt_trialsPerUser$userId[!df_rt_trialsPerUser$TrialNr==numTrials],
df_survey_checkedItemsPerUser$userId[df_survey_checkedItemsPerUser$checked<numSurveyItems],
df_attention_trialsPerUser$userId[!df_attention_trialsPerUser$trialNo==numAttentionTrials]))
# Manual checks showed:
## userID "5f4cc" is an odd user: no logs for item 1 and 2 -- strategy for now:
## keep that person anyway, with a note that for assessment of items 1 and 2, one person less was evaluated!
safe4now=c("5f4cc")
odd_userIDs=odd_userIDs[!odd_userIDs %in% safe4now]
# remove odd users from all dfs:
df_perf=df_perf[!df_perf$userId %in% odd_userIDs,]
df_rt=df_rt[!df_rt$userId %in% odd_userIDs,]
df_survey=df_survey[!df_survey$userId %in% odd_userIDs,]
df_attention=df_attention[!df_attention$userId %in% odd_userIDs,]
df_demo=df_demo[!df_demo$userId %in% odd_userIDs,]
# make factors (needed for grouping in later models/plots)
df_rt$userId <- as.factor(df_rt$userId)
df_rt$group <- as.factor(df_rt$group)
```
## General infos after removal of incomplete datasets
How many users do we have in our performance df before any cleaning (i.e., also including users with incomplete datasets)? `r length(userNo_raw$Names)`
After cleaning, we have `r length(unique(df_perf$userId))` participants. Of those,
* `r length(unique(df_perf$userId[df_perf$group=="C"]))` participants were in the control condition and
* `r length(unique(df_perf$userId[df_perf$group=="E"]))` participants in the explanation condition.
## Check covariates across groups
Additionally to assessing performance, we also acquire age and gender information of participants.
What do our groups look like? Are the groups comparable?
```{r echo=FALSE, warning=FALSE,fig.height = 4, fig.width = 3.5}
# get only age info
df_demo_age = df_demo[df_demo$item=='age',]
df_demo_gender = df_demo[df_demo$item=='gender',]
# summarize to get overview values of frequencies and percentages (age)
df_demo_age_summary=dplyr::summarise(group_by(df_demo_age, group, item, responseNo),
SumChecks=sum(checked),
PercUsersChecked=100*(sum(checked)/length(unique(userId))))
# summarize to get overview values of frequencies and percentages (gender)
df_demo_gender_summary=dplyr::summarise(group_by(df_demo_gender, group, item, responseNo),
SumChecks=sum(checked),
PercUsersChecked=100*(sum(checked)/length(unique(userId))))
# convert to factor for proper plotting:
df_demo_age_summary$responseNo=as.factor(df_demo_age_summary$responseNo)
df_demo_gender_summary$responseNo=as.factor(df_demo_gender_summary$responseNo)
# AGE: display frequency as raw counts
CovAge_FreqUserResponses = ggplot(data=df_demo_age_summary, aes(x=responseNo,fill = group)) +
geom_bar(aes(y = SumChecks),stat="identity",position = position_dodge(preserve = "single"))+
labs(title="Age of participants (freq. counts)",x="", y = "Frequency of answer")+
theme_bw(base_size = 10)+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_x_discrete(breaks=1:7, labels=c("18-24y","25-34y","35-44y","45-54y","55-64y","65 and\nover","Prefer not\nto answer"))+
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 60,hjust = 0.95))
# AGE: display frequency as percentage
CovAge_PercUserResponses = ggplot(data=df_demo_age_summary, aes(x=responseNo,fill = group)) +
geom_bar(aes(y = PercUsersChecked),stat="identity",position = position_dodge(preserve = "single"))+
labs(title="Age of participants (% of users)",x="", y = "% of users")+
theme_bw(base_size = 10)+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_x_discrete(breaks=1:7, labels=c("18-24y","25-34y","35-44y","45-54y","55-64y","65 and\nover","Prefer not\nto answer"))+
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 60,hjust = 0.95))
# put plots together
figure_CovAgeRaw <- ggarrange(CovAge_FreqUserResponses,CovAge_PercUserResponses,
ncol = 1, nrow = 2, align = "v")
# save
# BUGFIX: pass the combined figure explicitly; without `plot =`, ggsave()
# writes the last individual plot instead of the ggarrange composite
ggsave("Figures/CovAgeRaw_distribution_IAZ_EXP1_FINAL.pdf",plot = figure_CovAgeRaw,width = 9, height = 7)
# show
figure_CovAgeRaw
# GENDER: display frequency as raw counts
CovGender_FreqUserResponses = ggplot(data=df_demo_gender_summary, aes(x=responseNo,fill = group)) +
geom_bar(aes(y = SumChecks),stat="identity",position = position_dodge(preserve = "single"))+
labs(title="Gender of participants (freq. counts)",x="", y = "Frequency of answer")+
theme_bw(base_size = 10)+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_x_discrete(breaks=1:7, labels=c("Female","Male","Transgender\nfemale","Transgender\nmale","Non-binary/\n gender non-\nconforming","Not listed","Prefer not\nto answer"))+
ylim(0, 40)+
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 60,hjust = 0.95))
# GENDER: display frequency as percentage
CovGender_PercUserResponses = ggplot(data=df_demo_gender_summary, aes(x=responseNo,fill = group)) +
geom_bar(aes(y = PercUsersChecked),stat="identity",position = position_dodge(preserve = "single"))+
labs(title="Gender of participants (% of users)",x="", y = "% of users")+
theme_bw(base_size = 10)+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_x_discrete(breaks=1:7, labels=c("Female","Male","Transgender\nfemale","Transgender\nmale","Non-binary/\n gender non-\nconforming","Not listed","Prefer not\nto answer"))+
#ylim(0, 70)+
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 60,hjust = 0.95))
# put plots together
figure_CovGenderRaw <- ggarrange(CovGender_FreqUserResponses,CovGender_PercUserResponses,
ncol = 1, nrow = 2, align = "v")
# save
# BUGFIX: pass the combined figure explicitly (see above)
ggsave("Figures/CovGenderRaw_distribution_IAZ_EXP1_FINAL.pdf",plot = figure_CovGenderRaw,width = 9, height = 7)
# show
figure_CovGenderRaw
```
Let's run a statistical comparison between our two groups. For age, we have ordinal data (in age bands), so we will use a non-parametric statistical test for ordinal data, that's the Wilcoxon–Mann–Whitney U test.
For gender, we first need to check whether the data is normally distributed. If so, we use a t-test; if not, we again fall back on the non-parametric Wilcoxon–Mann–Whitney U test.
```{r echo=FALSE, warning=FALSE}
# statistical differences between groups?
# cumulative counts per age band, used to read off the median age group for reporting
medianAge_C=cumsum(df_demo_age_summary$SumChecks[df_demo_age_summary$group=="C"])
medianAge_E=cumsum(df_demo_age_summary$SumChecks[df_demo_age_summary$group=="E"])
# inspect: we have 50 people per group, in which group does the 25th person lie?
# count users who 'preferred not to answer' (responseNo 7)
N_noAnswer_age=nrow(df_demo_age[df_demo_age$responseNo==7 & df_demo_age$checked==1,])
N_noAnswer_gender=nrow(df_demo_gender[df_demo_gender$responseNo==7 & df_demo_gender$checked==1,])
# remove 'prefer not to answer' entries, keep only actually-checked responses
df_demo_age = df_demo_age[!df_demo_age$responseNo==7,]
df_demo_age_responses=df_demo_age[!df_demo_age$checked==0,]
df_demo_gender = df_demo_gender[!df_demo_gender$responseNo==7,]
df_demo_gender_responses=df_demo_gender[!df_demo_gender$checked==0,]
# Age first:
# check sample sizes, make sure they deviate not too much:
N_E_age=sum(df_demo_age_responses$group=='E')
N_C_age=sum(df_demo_age_responses$group=='C')
# age bands are ordinal, so use the non-parametric Wilcoxon test directly
aget="Wilcox"
agetest=uk_wilcox.test(df_demo_age_responses$responseNo[df_demo_age_responses$group=="E"],df_demo_age_responses$responseNo[df_demo_age_responses$group=="C"],paired=FALSE,exact=FALSE)
# effect size r = Z / sqrt(N)
ageeffsize=agetest$z_val/(sqrt(nrow(df_demo_age_responses)))
matchingRes=paste(matchingRes,paste("\n","AgeRaw",sep=""),"",aget,agetest$p.value,ageeffsize,sep = ",")
# Gender second:
# check sample sizes, make sure they deviate not too much:
N_E_gender=sum(df_demo_gender_responses$group=='E')
N_C_gender=sum(df_demo_gender_responses$group=='C')
# check for significant differences in gender distribution between groups:
# check if sample is normally distributed using shapiro test
shapiro=shapiro.test(df_demo_gender_responses$responseNo) # if p-value is lower than 0.05, you can conclude that the sample deviates from normality
if(shapiro$p.value > 0.05){
  # if it's normal: t-test
  gent="TTest"
  gentest=t.test(df_demo_gender_responses$responseNo ~ df_demo_gender_responses$group,alternative="two.sided")
  # BUGFIX: extract the numeric estimate; pasting the full cohen.d object into
  # the results string (and the inline report below) would print a garbled list
  geneffsize=cohen.d(df_demo_gender_responses$responseNo ~ df_demo_gender_responses$group)$estimate
} else {
  # otherwise: non-parametric Wilcoxon, effect size r = Z / sqrt(N)
  gent="Wilcox"
  gentest=uk_wilcox.test(df_demo_gender_responses$responseNo[df_demo_gender_responses$group=="E"],df_demo_gender_responses$responseNo[df_demo_gender_responses$group=="C"],paired=FALSE,exact=FALSE)
  geneffsize=gentest$z_val/(sqrt(nrow(df_demo_gender_responses)))
}
matchingRes=paste(matchingRes,paste("\n","GenderRaw",sep=""),shapiro$p.value,gent,gentest$p.value,geneffsize,sep = ",")
```
We acquired data from `r length(unique(df_demo$userId))` participants, with `r length(unique(df_demo$userId[df_demo$group=="C"]))` users in the control group (`r length(unique(df_demo$userId[df_demo$group=="C" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==1]))` female, `r length(unique(df_demo$userId[df_demo$group=="C" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==2]))` male, median age group is 35-44years), and `r length(unique(df_demo$userId[df_demo$group=="E"]))` users in the explanation group (`r length(unique(df_demo$userId[df_demo$group=="E" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==1]))` female, `r length(unique(df_demo$userId[df_demo$group=="E" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==2]))` male, median age group is 35-44years).
The analysis showed for *Age*:
* We have age information for `r N_E_age` users in the explanation and `r N_C_age` users in the control group.
* Is there a significant difference in terms of age between the groups? We compared ages of users in explanation condition and users in the control condition using a `r aget` test. This showed: U=`r agetest$statistic `, p=`r agetest$p.value `, r = `r ageeffsize `
The analysis showed for *Gender*:
* We have gender information for `r N_E_gender` users in the explanation and `r N_C_gender` users in the control group.
* Is there a significant difference in terms of gender between the groups? We compared gender distribution for users in explanation condition and users in the control condition using a `r gent` test. This showed
+ for wilcoxon test: U=`r gentest$statistic `, p=`r gentest$p.value `, r = `r geneffsize `
## Quality criteria
Before going into the hypotheses, we should apply some quality criteria to our data. Sub-quality data should be removed. The following subsections take care of such cases.
### Identify "speeders"
Speeders are people clicking through the study way too quickly to do the task properly.
Aim: identify IDs being faster than specified values (variable per game part).
This part will tag users that needed less than 2000ms to reach a feeding decision (suspiciously quick) in 4 or more trials.
```{r echo=FALSE, fig.align = "center", fig.height = 4, fig.width = 7}
# define min times for individual trial types (in ms)
min_timeAgreementScene= 1 # 1 second only? rationale: people agree really quickly, accept "speeding" here
min_timeStartScene=20000 # 20000=20s is the delay before button appears
min_timeStableUntilFeeding=2000 #2000ms = 2s
min_timeFeedbackScene=10000 # 10000=10s is current delay before button appears
# take a look: plot reaction times per participant, per trial type;
# users below the red threshold line get a repelled ID label
# AgreementScene
pRTagreement <- ggplot(unique(df_rt[,c('userId','timeAgreementScene','group')]), aes(x=timeAgreementScene, y=0,label=userId,colour = factor(group)))+
geom_point(alpha = 0.5)+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
geom_vline(xintercept = min_timeAgreementScene,linetype="dotted",
color = "red")+
geom_text(aes(min_timeAgreementScene,0,label = min_timeAgreementScene, hjust = -0.1,vjust = -1),color = "red")+
geom_label_repel(data = . %>% mutate(lab = ifelse(timeAgreementScene <
min_timeAgreementScene, as.character(userId), "")),
aes(label = lab),
box.padding = 1,
show.legend = FALSE,max.overlaps = Inf)+
labs(title="Time spent on agreement scene",x="Time (ms)", y = "")+
theme_bw()+
theme(axis.ticks.y=element_blank(),axis.text.y=element_blank())
# StartScene (i.e., instructions)
pRTstart <- ggplot(unique(df_rt[,c('userId','timeStartScene','group')]), aes(x=timeStartScene, y=0,label=userId,colour = factor(group)))+
geom_point(alpha = 0.5)+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
geom_vline(xintercept = min_timeStartScene,linetype="dotted",
color = "red")+
geom_text(aes(min_timeStartScene,0,label = min_timeStartScene, hjust = -0.1,vjust = -1),color = "red")+
geom_label_repel(data = . %>% mutate(lab = ifelse(timeStartScene <
min_timeStartScene, as.character(userId), "")),
aes(label = lab),
box.padding = 1,
show.legend = FALSE,max.overlaps = Inf)+
labs(title="Time spent on start (instruction) scene",x="Time (ms)", y = "")+
theme_bw()+
theme(axis.ticks.y=element_blank(),axis.text.y=element_blank())
# Time needed until feeding
pRTuntilFeeding <- ggplot(df_rt[,c('userId','timeStableUntilFeeding','group','TrialNr')], aes(x=as.factor(TrialNr), y=timeStableUntilFeeding,group=userId,label=userId,colour = factor(group)))+
geom_point(alpha = 0.5)+
geom_line()+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
geom_hline(yintercept = min_timeStableUntilFeeding,linetype="dotted",
color = "red")+
geom_text(aes(0.3,min_timeStableUntilFeeding,label = min_timeStableUntilFeeding, vjust = -1),color = "red")+
geom_label_repel(data = . %>% mutate(lab = ifelse(timeStableUntilFeeding < min_timeStableUntilFeeding,as.character(userId), "")),
aes(label = lab),
box.padding = 1,
show.legend = FALSE,max.overlaps = Inf) + #this removes the 'a' from the legend
labs(title="Time needed to reach feeding decision",x="Trial", y = "Time (ms)")+
theme_bw()
# time spent on feedback scenes
pRTfeedback <- ggplot(unique(df_rt[,c('userId','timeFeedbackScene','group','BlockNr')]), aes(x=as.factor(BlockNr), y=timeFeedbackScene,group=userId,label=userId,colour = factor(group)))+
geom_point(alpha = 0.5)+
geom_line()+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
geom_hline(yintercept = min_timeFeedbackScene,linetype="dotted",
color = "red")+
geom_text(aes(0.7,min_timeFeedbackScene,label = min_timeFeedbackScene, vjust = -1),color = "red")+
geom_label_repel(data = . %>% mutate(lab = ifelse(timeFeedbackScene < min_timeFeedbackScene,as.character(userId), "")),
aes(label = lab),
box.padding = 1,
show.legend = FALSE,max.overlaps = Inf) + #this removes the 'a' from the legend
labs(title="Time needed to study feedback",x="Block", y = "Time (ms)")+
theme_bw()
# put plots together, v1
figure_RTquality_Agree_Start <- ggarrange(pRTagreement,pRTstart,
ncol = 1, nrow = 2, align = "v")
# save
# BUGFIX: pass the combined figure explicitly; without `plot =`, ggsave()
# writes the last individual plot instead of the ggarrange composite
ggsave("Figures/DatQual_RTquality_Agree_Start_IAZ_EXP1_FINAL.pdf",plot = figure_RTquality_Agree_Start,width = 9, height = 7)
# put plots together, v2
figure_RTquality_Feeding_Feedback <- ggarrange(pRTuntilFeeding, pRTfeedback,
ncol = 1, nrow = 2, align = "v")
# save
# BUGFIX: pass the combined figure explicitly (see above)
ggsave("Figures/DatQual_RTquality_Feeding_Feedback_IAZ_EXP1_FINAL.pdf",plot = figure_RTquality_Feeding_Feedback,width = 9, height = 7)
# display figure
print("Display detailed RT data for different trials:")
figure_RTquality_Agree_Start
figure_RTquality_Feeding_Feedback
# get IDs of "speeders" per trialtype
speederAgreement_IDs=as.character(unique(df_rt$userId[df_rt$timeAgreementScene < min_timeAgreementScene]))
speederStart_IDs=as.character(unique(df_rt$userId[df_rt$timeStartScene < min_timeStartScene]))
speederFeeding_IDs=as.character(df_rt$userId[df_rt$timeStableUntilFeeding < min_timeStableUntilFeeding])
# each duplicated() pass drops the first occurrence of every ID, so three passes
# leave only IDs that appeared at least 4 times (= sped in 4 or more trials)
# duplicated once (i.e., happened in 2 trials at least):
speederFeeding_IDs=speederFeeding_IDs[duplicated(speederFeeding_IDs)]
# duplicated again (i.e., happened in 3 trials at least):
speederFeeding_IDs=speederFeeding_IDs[duplicated(speederFeeding_IDs)]
# duplicated again (i.e., happened in 4 trials at least):
speederFeeding_IDs=unique(speederFeeding_IDs[duplicated(speederFeeding_IDs)])
speederFeedback_IDs=as.character(unique(df_rt$userId[df_rt$timeFeedbackScene < min_timeFeedbackScene]))
# get all unique IDs of "speeders"
speeder_IDs=as.character(unique(c(speederAgreement_IDs,speederStart_IDs,speederFeeding_IDs,speederFeedback_IDs)))
# look at performance data of speeders:
# df_perf[df_perf$userId %in% speeder_IDs,]
```
### Identify participants failing the two attention checks
We include 2 attention checks during the game by asking participants to indicate current pack size after trials 3 and 7.
Aim: Identify IDs of users getting either one or both checks wrong; exclude those getting both wrong.
```{r echo=FALSE}
# identify correct and incorrect replies (user-reported pack size vs true pack size)
df_attention$correctReply = df_attention$userInput == df_attention$shubNo
# obtain IDs of participants that got at least 1 check wrong
attentionFailOne_IDs=as.character(unique(df_attention$userId[!df_attention$correctReply]))
# obtain IDs of participants that got both wrong (ID appears twice among incorrect replies)
attentionFailBoth_IDs=as.character(df_attention$userId[!df_attention$correctReply][duplicated(df_attention$userId[!df_attention$correctReply])])
# look at performance data of non-attentive users:
# df_perf[df_perf$userId %in% attentionFailBoth_IDs,]
# Second attention check: did users detect the "red herring" question (item 7); also consider removing those who did not!
# set to data table
dt_survey=setDT(df_survey,key=c("userId"))
attentionFailSurvey = dt_survey[ checked == 1 & itemNo == 7 & !responseNo==6 ]
# BUGFIX: deduplicate and coerce to character for consistency with the other *_IDs vectors
attentionFailSurvey_IDs=as.character(unique(attentionFailSurvey$userId))
# look at performance data of non-attentive users:
# df_perf[df_perf$userId %in% attentionFailSurvey_IDs,]
```
### Identify "straight-liners" in game part
Identify users who always give the same answer in the game part (over individual blocks, and over all blocks) DESPITE not increasing their pack size.
Aim: identify IDs of users "straight-lining" in at least two blocks, while pack size did not change (i.e., who were "immune to feedback").
```{r echo=FALSE}
# set to data table
dt_perf=setDT(df_perf,key=c("userId", "blockNo"))
# For each user and block, flag whether each input column varied at all.
# mismatch = TRUE is good, meaning there is variation in input data.
# (Single grouped aggregation replaces five separate scans joined by nested merges;
# keyby yields the same userId/blockNo-keyed result.)
straightlineGame_data=dt_perf[,list(mismatchP1=length(unique(plant1))>1,
                                    mismatchP2=length(unique(plant2))>1,
                                    mismatchP3=length(unique(plant3))>1,
                                    mismatchP4=length(unique(plant4))>1,
                                    mismatchP5=length(unique(plant5))>1,
                                    mismatchShubNoNew=length(unique(shubNoNew))>1),
                              keyby=.(userId,blockNo)]
# keep only rows without any mismatches (i.e., blocks without variation in user input)
straightlineGame_data=straightlineGame_data[(!straightlineGame_data$mismatchP1) & (!straightlineGame_data$mismatchP2) & (!straightlineGame_data$mismatchP3) & (!straightlineGame_data$mismatchP4) & (!straightlineGame_data$mismatchP5) & (!straightlineGame_data$mismatchShubNoNew), ]
# count occurences of userIDs
straightlinersGame_IDs=straightlineGame_data %>% count(userId)
# keep only IDs of users straight-lining in at least 3 blocks (i.e. half the blocks)
straightlinersGame_IDs=straightlinersGame_IDs[(straightlinersGame_IDs$n > 2), ]
straightlinersGame_IDs=as.character(straightlinersGame_IDs$userId)
# look at performance data of game-straightliners:
# df_perf[df_perf$userId %in% straightlinersGame_IDs,]
```
### Identify "straight-liners" in survey part
Identify users who always give very uniform answers in the survey part.
Aim: identify IDs of users "straight-lining", i.e. giving only responses with either positive or negative valence.
```{r echo=FALSE}
# convert survey df to a data table keyed by user
dt_survey=setDT(df_survey,key=c("userId"))
# restrict to the checked responses for items 3,4,5,6,8,9 (item 7 is the red herring)
straightlineSurvey_data=dt_survey[ checked == 1 & itemNo > 2 & !itemNo==7]
# classify each response's valence in one pass:
# responseNo 3 or 6 -> neutral, above 3 -> positive, otherwise -> negative
straightlineSurvey_data$valence=ifelse(
  straightlineSurvey_data$responseNo==3 | straightlineSurvey_data$responseNo==6,
  "neut",
  ifelse(straightlineSurvey_data$responseNo>3, "pos", "neg"))
# a user "straight-lines" if every one of their responses shares a single valence
# (mismatchValenceSurveyItems = TRUE means the user's answers varied in valence)
straightlinersSurvey_IDs=straightlineSurvey_data[,list(mismatchValenceSurveyItems=length(unique(valence))>1),keyby=.(userId)]
# keep the users WITHOUT a valence mismatch (uniform answers) and return their IDs
straightlinersSurvey_IDs=as.character(
  straightlinersSurvey_IDs$userId[!straightlinersSurvey_IDs$mismatchValenceSurveyItems])
# look at survey responses of survey-straightliners:
# df_survey[df_survey$userId %in% straightlinersSurvey_IDs & checked==1,]
```
### Remove data from problematic users
As we have identified users that seem to have dodgy data, we want to remove them.
```{r echo=FALSE}
# Helper: drop all rows whose userId appears in `ids` from a data frame.
# Standard-evaluation subsetting (instead of subset()'s non-standard
# evaluation) keeps the helper safe to call from any environment.
# Replaces 20 near-identical subset() calls with one mechanism.
remove_users <- function(df, ids) {
  df[!(df$userId %in% ids), , drop = FALSE]
}

# remove speeders from all dfs
df_perf      <- remove_users(df_perf,      speeder_IDs)
df_rt        <- remove_users(df_rt,        speeder_IDs)
df_survey    <- remove_users(df_survey,    speeder_IDs)
df_attention <- remove_users(df_attention, speeder_IDs)
df_demo      <- remove_users(df_demo,      speeder_IDs)

# remove attentionFailers (game checks), that were not already recognized as speeders
attentionFailBoth_IDs_clean <- attentionFailBoth_IDs[!attentionFailBoth_IDs %in% speeder_IDs]
df_perf      <- remove_users(df_perf,      attentionFailBoth_IDs_clean)
df_rt        <- remove_users(df_rt,        attentionFailBoth_IDs_clean)
df_survey    <- remove_users(df_survey,    attentionFailBoth_IDs_clean)
df_attention <- remove_users(df_attention, attentionFailBoth_IDs_clean)
df_demo      <- remove_users(df_demo,      attentionFailBoth_IDs_clean)

# remove attentionFailers (survey check), that were not already recognized as speeders or game AFs
attentionFailSurvey_IDs_clean <- attentionFailSurvey_IDs[!attentionFailSurvey_IDs %in% c(speeder_IDs, attentionFailBoth_IDs_clean)]
df_perf      <- remove_users(df_perf,      attentionFailSurvey_IDs_clean)
df_rt        <- remove_users(df_rt,        attentionFailSurvey_IDs_clean)
df_survey    <- remove_users(df_survey,    attentionFailSurvey_IDs_clean)
df_attention <- remove_users(df_attention, attentionFailSurvey_IDs_clean)
df_demo      <- remove_users(df_demo,      attentionFailSurvey_IDs_clean)

# remove game straightliners, that were not already recognized as speeders / attention failers
straightlinersGame_IDs_clean <- straightlinersGame_IDs[!straightlinersGame_IDs %in% c(speeder_IDs, attentionFailBoth_IDs_clean, attentionFailSurvey_IDs_clean)]
df_perf      <- remove_users(df_perf,      straightlinersGame_IDs_clean)
df_rt        <- remove_users(df_rt,        straightlinersGame_IDs_clean)
df_survey    <- remove_users(df_survey,    straightlinersGame_IDs_clean)
df_attention <- remove_users(df_attention, straightlinersGame_IDs_clean)
df_demo      <- remove_users(df_demo,      straightlinersGame_IDs_clean)

# remove survey straightliners, that were not already recognized as speeders / attention failers / game straightliners
straightlinersSurvey_IDs_clean <- straightlinersSurvey_IDs[!straightlinersSurvey_IDs %in% c(speeder_IDs, attentionFailBoth_IDs_clean, attentionFailSurvey_IDs_clean, straightlinersGame_IDs_clean)]
df_perf      <- remove_users(df_perf,      straightlinersSurvey_IDs_clean)
df_rt        <- remove_users(df_rt,        straightlinersSurvey_IDs_clean)
df_survey    <- remove_users(df_survey,    straightlinersSurvey_IDs_clean)
df_attention <- remove_users(df_attention, straightlinersSurvey_IDs_clean)
df_demo      <- remove_users(df_demo,      straightlinersSurvey_IDs_clean)

# repeat sanity check: Number of users in each df the same?
# make df with user numbers: user ID 'XXXX' appears in how many dfs?
# the value of 4 would be desirable: all IDs represented in all 4 dfs
# (df_demo is cleaned too but deliberately not part of this count)
userNo_clean <- data.frame(table(c(as.character(unique(df_perf$userId)),
                                   as.character(unique(df_rt$userId)),
                                   as.character(unique(df_survey$userId)),
                                   as.character(unique(df_attention$userId)))))
names(userNo_clean) <- c("Names", "Matches")
```
So to summarize:
* we have `r length(userNo_raw$Names)` users to begin with
* we remove `r sum(!userNo_raw$Matches==5)` users that have incomplete datasets (aborted prematurely)
* we remove `r length(odd_userIDs)` whose information was not logged properly
** Note here: for one user, logging failed for the survey items 1 and 2 - we will keep data for this person anyway for all other measurements
* we remove `r length(speeder_IDs)` speeders
* we remove `r length(attentionFailBoth_IDs_clean)` users that failed both attention tests during the game
* we remove `r length(attentionFailSurvey_IDs_clean)` users that failed the attention test in the survey
* we remove `r length(straightlinersGame_IDs_clean)` users that straightlined in the game, despite not improving
* we remove `r length(straightlinersSurvey_IDs_clean)` users that straightlined in the survey
Finally: How many users do we have in our clean performance df? `r length(unique(df_perf$userId))`
Do we have an equal number of users in each clean dataframe? `r length(unique(userNo_clean$Matches))==1`
```{r echo=FALSE}
# get age distribution after data cleaning
# summarize per group and age bracket: number of checks and the percentage
# of users who checked that bracket
df_demo_age_summary=dplyr::summarise(group_by(df_demo[df_demo$item=="age",], group, item, responseNo),
SumChecks=sum(checked),
PercUsersChecked=100*(sum(checked)/length(unique(userId))))
# cumulative counts per group, used to locate the median age bracket
# NOTE(review): despite the names these are cumulative sums, not medians;
# the median bracket is read off from where the cumsum crosses N/2
medianAge_C=cumsum(df_demo_age_summary$SumChecks[df_demo_age_summary$group=="C"])
medianAge_E=cumsum(df_demo_age_summary$SumChecks[df_demo_age_summary$group=="E"])
```
### Final, clean dataset
To sum up, in our final data we have `r length(unique(df_perf$userId))` users, with `r length(unique(df_perf$userId[df_perf$group=="C"]))` users in the control group (`r length(unique(df_demo$userId[df_demo$group=="C" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==1]))` female, `r length(unique(df_demo$userId[df_demo$group=="C" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==2]))` male, median age group is 35-44years), and
`r length(unique(df_perf$userId[df_perf$group=="E"]))` users in the explanation group (`r length(unique(df_demo$userId[df_demo$group=="E" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==1]))` female, `r length(unique(df_demo$userId[df_demo$group=="E" & df_demo$item=="gender" & df_demo$checked==1 & df_demo$responseNo==2]))` male, median age group is 35-44years).
Re-check: are there still no significant differences in terms of gender / age?
```{r echo=FALSE, warning=FALSE}
# statistical differences between groups after cleaning (age / gender)?
df_demo_age=df_demo[df_demo$item=="age",]
df_demo_gender=df_demo[df_demo$item=="gender",]
# count users who 'prefered not to answer' (responseNo 7 per the survey coding)
N_noAnswer_age=nrow(df_demo_age[df_demo_age$responseNo==7 & df_demo_age$checked==1,])
N_noAnswer_gender=nrow(df_demo_gender[df_demo_gender$responseNo==7 & df_demo_gender$checked==1,])
# remove 'prefer not to answer' entries, then keep only checked responses
df_demo_age = df_demo_age[!df_demo_age$responseNo==7,]
df_demo_age_responses=df_demo_age[!df_demo_age$checked==0,]
df_demo_gender = df_demo_gender[!df_demo_gender$responseNo==7,]
df_demo_gender_responses=df_demo_gender[!df_demo_gender$checked==0,]
# Age first:
# check sample sizes, make sure they deviate not too much:
N_E_age=sum(df_demo_age_responses$group=='E')
N_C_age=sum(df_demo_age_responses$group=='C')
aget="Wilcox"
# uk_wilcox.test is a project helper -- presumably a wrapper around
# wilcox.test() that also returns z_val for the effect-size computation
# below; TODO confirm against its definition earlier in the file
agetest=uk_wilcox.test(df_demo_age_responses$responseNo[df_demo_age_responses$group=="E"],df_demo_age_responses$responseNo[df_demo_age_responses$group=="C"],paired=FALSE,exact=FALSE)
# effect size r = z / sqrt(N)
ageeffsize=agetest$z_val/(sqrt(nrow(df_demo_age_responses)))
# append results to the running CSV-style results string
matchingRes=paste(matchingRes,paste("\n","AgeClean",sep=""),"",aget,agetest$p.value,ageeffsize,sep = ",")
# Gender second:
# check sample sizes, make sure they deviate not too much:
N_E_gender=sum(df_demo_gender_responses$group=='E')
N_C_gender=sum(df_demo_gender_responses$group=='C')
# check for significant differences in gender distribution between groups
# NOTE(review): gender is categorical; a chi-square test would be more
# conventional than t/Wilcoxon on the numeric coding -- flagged, not changed
# check if sample is normally distributed using shapiro test
shapiro=shapiro.test(df_demo_gender_responses$responseNo) # if p-value is lower than 0.05, you can conclude that the sample deviates from normality
if(shapiro$p.value > 0.05){
# if it's normal: t-test
gent="TTest"
gentest=t.test(df_demo_gender_responses$responseNo ~ df_demo_gender_responses$group,alternative="two.sided")
geneffsize=cohen.d(df_demo_gender_responses$responseNo ~ df_demo_gender_responses$group)
} else {
# otherwise: nonparametric Wilcoxon rank-sum test via the project helper
gent="Wilcox"
gentest=uk_wilcox.test(df_demo_gender_responses$responseNo[df_demo_gender_responses$group=="E"],df_demo_gender_responses$responseNo[df_demo_gender_responses$group=="C"],paired=FALSE,exact=FALSE)
geneffsize=gentest$z_val/(sqrt(nrow(df_demo_gender_responses)))
}
# append gender results to the running results string
matchingRes=paste(matchingRes,paste("\n","GenderClean",sep=""),shapiro$p.value,gent,gentest$p.value,geneffsize,sep = ",")
```
The analysis showed for *Age* in the clean dataset:
* We have age information for `r N_E_age` users in the explanation and `r N_C_age` users in the control group.
* Is there a significant difference in terms of age between the groups? We compared age of users in the explanation condition and users in the control condition using a `r aget` test. This showed: U=`r agetest$statistic `, p=`r agetest$p.value `, r = `r ageeffsize `
The analysis showed for *Gender* in the clean dataset:
* We have gender information for `r N_E_gender` users in the explanation and `r N_C_gender` users in the control group.
* Is there a significant difference in terms of gender between the groups? We compared gender distribution of users in explanation condition and users in the control condition using a `r gent` test. This showed
+ for wilcoxon test: U=`r gentest$statistic `, p=`r gentest$p.value `, r = `r geneffsize `
## Hypotheses
The main hypothesis is the following:
*H1) CFEs will help users tasked to discover unknown relationships in data. We expect this to affect objective as well as subjective understandability.*
That means, we expect users in the explanation condition to
* H1.1) perform better over time in terms of number of Shubs generated, *AND*
* H1.2) will become quicker in the final blocks, because choosing the right plants will become more automatic, *AND*
* H1.3) can more clearly state which plants were crucial for the Shubs to prosper (survey items 1 and 2).
Further, we expect:
*H2) Users will differ in terms of their subjective understanding*, specifically:
* H2.1) Users will differ in how far they found the feedback (CF-style explanation vs. overview over past choices only) useful, and in how far they could make use of it, with an advantage of providing CFEs (survey items 5, 6)
* H2.2) Users expect that providing CFEs will be more helpful for other users, too (survey item 9).
Moreover:
*H3) We expect users in different conditions not to differ in terms of how well they understood the feedback per se, or needing support for understanding. This would be good, because it means that the added information provided does not overload the participant's cognitive capacities. (survey items 3, 4).* So this is also control to make sure groups don't differ in a weird way.
Last:
*H4) We expect timing and efficacy of how feedback was provided to be comparable (survey item 10) - a further control.* Could still be that there is a difference here, as less useful feedback (control) is perceived less efficient.
*H5) Finally, we predict that users will not have uncovered inconsistencies in the feedback.* It would be weird for the control group; and the models were really good, and we trust CEML to do a good job. (survey item 8)
# Statistical assessment
[...] Comparisons of performance over time between users in the explanation and control conditions, respectively, are performed using R 4.1.1 [@r_core_team_r_2021]. Changes in performance over 12 trials as a measure of learning rate per group are modeled using the lme4 package v1.1-27.1.
In the model testing for differences in terms of user performance, the dependent variable is number of Shubs generated. In the assessment of user's reaction time, we used time needed to reach a feeding decision in each trial as dependent variable.
The final models include the fixed effects of group, trial number and their interaction. The random-effect structure includes a by-subjects random intercept.
Advantages of using this approach include that these models account for correlations of data drawn from the same participant [@detry_analyzing_2016].
<!--The code is inspired by this 2-part tutorial: https://www.youtube.com/watch?v=AWInLxpiZuA; https://www.youtube.com/watch?v=YsD8b5KYdMw -->
Model fits are compared with the analysis of variance function of the stats package.
Effect sizes are computed in terms of $\eta_{\text{p}}^{2}$ using the effectsize package v.0.5.
Significant main effects or interactions are followed up by computing the pairwise estimated marginal means. All post-hoc analyses reported are Bonferroni corrected to account for multiple comparisons.
## H1: Providing CFEs helps users
Recap the full hypothesis:
*H1) CFEs will help users tasked to discover unknown relationships in data. We expect this to affect objective as well as subjective understandability.*
That means, we expect users in the explanation condition to
* H1.1) perform better over time in terms of number of Shubs generated, *AND*
* H1.2) will become quicker in the final blocks, because choosing the right plants will become more automatic, *AND*
* H1.3) can more clearly state which plants were crucial for the Shubs to prosper (survey items 1 and 2).
### H1.1) Users in the explanation condition perform better over time in terms of number of Shubs generated.
Let's start with a first peek at the data: Descriptive stats + plotting the pack size trajectories per trial and block for each person individually (figure not shown in .pdf).
```{r echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"}
# Descriptive stats for H1.1 (pack size over time, per group)
# make group a factor
df_perf$group=as.factor(df_perf$group)
# First peek at the data, getting min / max / median:
print("First peek at the data, getting min / max / median:")
print(tapply(df_perf$shubNoNew, df_perf$group, summary))
# CHECK: What can we see here? Do groups differ wrt the range? Does one have smaller minimal values / larger maximal scores?
#Next is visual assessment: Plot scores per participant per trial and also averages over blocks (aka spaghetti plot):
# spaghetti plot: one line per participant, colored by group
H1.1_p_ShubsPerTrial <- ggplot(df_perf, aes(x=factor(trialNo), y=shubNoNew, group = userId, color= group))+
geom_point(alpha = 0.5)+
geom_line()+
#facet_wrap(vars(group),nrow = 2, ncol = 1)+
labs(title="Development of pack size by group over trials",x="Trial", y = "Pack size")+
theme_bw(base_size = 10)+
#scale_y_continuous(limits = c(0, 100))+
scale_x_discrete(breaks=1:numTrials)+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
theme(legend.position="bottom")
# prepare line plot to show sd and sem
# Summarise `varname` of `data` within each combination of `groupnames`.
#
# @param data       data frame to summarise
# @param varname    name (string) of the numeric column to summarise
# @param groupnames character vector of grouping column names
# @return data frame with one row per group combination and, besides the
#   grouping columns: mean (NA-removed mean), SEM and sem.
#   NOTE(review): the `SEM` column is actually the standard *deviation*;
#   it is kept under its original (misleading) name for backward
#   compatibility -- `sem` is the true standard error of the mean.
data_summary <- function(data, varname, groupnames){
  # NOTE: ddply() comes from plyr (assumed attached by the notebook setup),
  # not dplyr -- the original loaded dplyr here, which ddply does not use.
  summary_func <- function(x, col){
    s <- sd(x[[col]], na.rm=TRUE)  # computed once, reused for SEM and sem
    c(mean = s_mean <- mean(x[[col]], na.rm=TRUE),
      SEM = s,
      sem = s/sqrt(length(x[[col]])))
  }
  data_sum <- ddply(data, groupnames, .fun=summary_func, varname)
  return(data_sum)
}
# group-level mean / sd / sem of pack size per trial
df_ShubsPerTrial_summary=data_summary(df_perf, varname="shubNoNew",groupnames=c("group","trialNo"))
# plot group-mean data per trial, with a sem ribbon
H1.1_p_ShubsPerTrial_summary <- ggplot(df_ShubsPerTrial_summary, aes(x=factor(trialNo), y=mean, group = group, color= group))+
geom_point(alpha = 0.5)+
geom_line()+
geom_ribbon(aes(ymin=mean-sem, ymax=mean+sem,fill=group), linetype=2, alpha=0.1)+
#facet_wrap(vars(group),nrow = 2, ncol = 1)+
labs(title="Mean pack size by group over trials",x="Trial", y = " Mean pack size")+
theme_bw(base_size = 10)+
#scale_y_continuous(limits = c(0, 100))+
scale_x_discrete(breaks=1:numTrials)+
scale_y_continuous(limits = c(0, 75))+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
theme(legend.position="bottom")
# plot averaged data per block
# note: aggregate with a vector-valued FUN yields a matrix column, hence
# the shubNoNew[,"mean"] indexing in the plot below
df_perf_blockStats<-aggregate(shubNoNew ~ blockNo * userId + group, data=df_perf, FUN = function(x) c(mean = mean(x), SEM = sd(x), sem = sd(x)/sqrt(length(x))))
df_ShubsPerBlock_summary=data_summary(df_perf, varname="shubNoNew",groupnames=c("group","blockNo"))
# spaghetti plot: per-user block means with sem ribbons
H1.1_p_ShubsPerBlock <- ggplot(df_perf_blockStats, aes(x=blockNo, y=shubNoNew[,"mean"], group = userId, color= group))+
geom_point(alpha = 0.5)+
geom_line()+
geom_ribbon(aes(ymin=shubNoNew[,"mean"]-shubNoNew[,"sem"], ymax=shubNoNew[,"mean"]+shubNoNew[,"sem"],fill=group), linetype=2, alpha=0.1)+
#facet_wrap(vars(group),nrow = 2, ncol = 1)+
labs(title="Development of pack size by group over blocks",x="Block", y = "pack size")+
theme_bw(base_size = 10)+
#scale_y_continuous(limits = c(0, 100))+
scale_x_continuous(breaks=1:max(df_perf$blockNo))+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
theme(legend.position="bottom")
# plot group-mean data per block, with a sem ribbon
H1.1_p_ShubsPerBlock_summary <- ggplot(df_ShubsPerBlock_summary, aes(x=blockNo, y=mean, group = group, color= group))+
geom_point(alpha = 0.5)+
geom_line()+
geom_ribbon(aes(ymin=mean-sem, ymax=mean+sem,fill=group), linetype=2, alpha=0.1)+
#facet_wrap(vars(group),nrow = 2, ncol = 1)+
labs(title="Mean pack size by group over blocks",x="Block", y = "Mean pack size")+
theme_bw(base_size = 10)+
#scale_y_continuous(limits = c(0, 100))+
scale_x_continuous(breaks=1:max(df_ShubsPerBlock_summary$blockNo))+
scale_colour_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
scale_fill_manual("Group", values=c(Ccol,Pcol), labels = c("Control", "CFE"))+
theme(legend.position="bottom")
# same plots in separate facets per group, for better visibility
H1.1_p_ShubsPerTrial_facet <- H1.1_p_ShubsPerTrial + facet_wrap(vars(group),nrow = 2, ncol = 1) + theme_bw(base_size = 10)
H1.1_p_ShubsPerBlock_facet <- H1.1_p_ShubsPerBlock + facet_wrap(vars(group),nrow = 2, ncol = 1) + theme_bw(base_size = 10)
# put all plots together into one two-row figure
H1.1_figure1_ShubData <- ggarrange(H1.1_p_ShubsPerTrial, H1.1_p_ShubsPerBlock,
ncol = 1, nrow = 2, heights=c(4,4), common.legend = TRUE)
# save -- the figure is passed explicitly: the original call omitted the
# `plot` argument, so ggsave() fell back to last_plot(), which is not the
# ggarrange object just built and could silently save the wrong figure
ggsave("Figures/H1.1_figure1_ShubData_IAZ_EXP1_FINAL.pdf", plot = H1.1_figure1_ShubData, width = 5, height = 4)
# same arrangement for the group-mean (summary) plots
H1.1_figure1_ShubData_summary <- ggarrange(H1.1_p_ShubsPerTrial_summary, H1.1_p_ShubsPerBlock_summary,
ncol = 1, nrow = 2, heights=c(4,4), common.legend = TRUE)
# save, again passing the intended figure explicitly
ggsave("Figures/H1.1_figure1_ShubData_summary_IAZ_EXP1_FINAL.pdf", plot = H1.1_figure1_ShubData_summary, width = 5, height = 4)
# # show
# print("Display figures showing development of pack size over trials / blocks:")
# #H1.1_figure1_ShubData
# H1.1_figure1_ShubData_summary
# last, make trialNo a factor so the upcoming LME is estimated as a
# 2x12 ANOVA-style model (one level per trial) rather than a linear trend
df_perf$trialNo = as.factor(df_perf$trialNo)
#summary(df_perf)
```
Now on to the statistics.
```{r echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"}
# Setting up our LME model (as a 2x12 Anova, group by trial)
# We use a mixed design, with one within-subjects IV (trial) and one between subjects IV (group),
# investigating the effect of both on Shubs generated.
# Note that we add a random intercept for the participant by stating + (1|userId)
# This makes it repeated measures, as we control for the random effect of
# one person doing something multiple times.
ShubNo_effect= lmer(shubNoNew ~ trialNo*group + (1|userId), data = df_perf) # linear model DV ShubNoNew predicted by the IV (trials, i.e. time)
ShubNo_effect_anova=anova(ShubNo_effect) # show model as anova
names(ShubNo_effect_anova)=c("SumSq","MeanSq","NumDF","DenDF","Fvalue","Pvalue") # rename fields for easy access
print("ANOVA table:")
print(ShubNo_effect_anova)
ShubNo_effect_effsize=effectsize::eta_squared(ShubNo_effect,partial = TRUE) # effect size eta squared
ShubNo_effect_r2=r.squaredGLMM(ShubNo_effect) # R^2, gives 2 values: R2m = effect of IV; R2c = effect of IV and random effect
# REMEMBER: WHEN THERE IS A SIGNIFICANT INTERACTION, DO NOT INTERPRET THE MAIN EFFECT ANYMORE (report it, though).
# POST-HOC:
# MAIN EFFECTS:
# so what conditions are significant, given main effects? NOT NECESSARY IN CASE OF SIGN. INTERACTION!
ShubNo_effect_posthoc_ME_time=emmeans(ShubNo_effect, list(pairwise ~ trialNo), adjust = "bonferroni") # pairwise comparisons between different trials
# estimated marginal means - first table: shows how many Shubs were produced on average per trial; based on that we can make statements of "more" / "fewer" Shubs produced
ShubNo_effect_posthoc_ME_group=emmeans(ShubNo_effect, list(pairwise ~ group), adjust = "bonferroni") # pairwise comparisons between different groups
# estimated marginal means - first table: shows how many Shubs were produced on average per group; based on that we can make statements of "more" / "fewer" Shubs produced
# get raw (not model-estimated) mean and sem per group for reporting:
ShubNo_effect_posthoc_ME_group_C_mean=mean(df_perf[df_perf$group=="C",]$shubNoNew)
ShubNo_effect_posthoc_ME_group_C_sem=sd(df_perf[df_perf$group=="C",]$shubNoNew)/sqrt(length(df_perf[df_perf$group=="C",]$shubNoNew))
ShubNo_effect_posthoc_ME_group_E_mean=mean(df_perf[df_perf$group=="E",]$shubNoNew)
ShubNo_effect_posthoc_ME_group_E_sem=sd(df_perf[df_perf$group=="E",]$shubNoNew)/sqrt(length(df_perf[df_perf$group=="E",]$shubNoNew))
# INTERACTION: group contrasts within each trial (Bonferroni-corrected)
ShubNo_effect_posthoc_INT_timeXgroup=emmeans(ShubNo_effect, list(pairwise ~ group | trialNo), adjust = "bonferroni")
ShubNo_effect_posthoc_INT_timeXgroup_effsizes=eff_size(ShubNo_effect_posthoc_INT_timeXgroup$`emmeans of group | trialNo`, sigma = sigma(ShubNo_effect), edf = Inf)
```
#### Results
The analysis revealed:
* a significant interaction (group x trials): F(`r ShubNo_effect_anova$NumDF[3]`,`r ShubNo_effect_anova$DenDF[3]`)=`r ShubNo_effect_anova$Fvalue[3] `, p=`r ShubNo_effect_anova$Pvalue[3]`,$\eta_{\text{p}}^{2}$=`r ShubNo_effect_effsize$Eta2_partial[3]`
Additionally:
* there was a significant main effect of trialnumber (time): F(`r ShubNo_effect_anova$NumDF[1]`,`r ShubNo_effect_anova$DenDF[1]`)=`r ShubNo_effect_anova$Fvalue[1] `, p=`r ShubNo_effect_anova$Pvalue[1]`,$\eta_{\text{p}}^{2}$=`r ShubNo_effect_effsize$Eta2_partial[1]`
* however, there was a no main effect of group: F(`r ShubNo_effect_anova$NumDF[2]`,`r ShubNo_effect_anova$DenDF[2]`)=`r ShubNo_effect_anova$Fvalue[2] `, p=`r ShubNo_effect_anova$Pvalue[2]`,$\eta_{\text{p}}^{2}$=`r ShubNo_effect_effsize$Eta2_partial[2]` (mean ShubNo explanation group: `r ShubNo_effect_posthoc_ME_group_E_mean`, sem=`r ShubNo_effect_posthoc_ME_group_E_sem`; mean ShubNo control group: `r ShubNo_effect_posthoc_ME_group_C_mean`, sem=`r ShubNo_effect_posthoc_ME_group_C_sem`).
Posthoc analysis revealed significant differences between groups from trial 9 onwards
(trial 9: t(56.7)=2.461, p=0.0169, Cohen's d=-1.711;
trial 10: t(56.7)=2.609, p=0.0116, Cohen's d=-1.814;
trial 11: t(56.7)=3.522, p=0.0009, Cohen's d=-2.449;
trial 12: t(56.7)=3.861, p=0.0003, Cohen's d=-2.685):
```{r echo=FALSE, fig.height = 2, fig.width = 7, fig.align = "center"}
# add statistics info to plot:
H1.1_p_ShubsPerTrial_summary_anno1 = H1.1_p_ShubsPerTrial_summary+
# vertical
geom_segment(aes(x = 9, y = 2.5, xend = 9, yend = 9),colour = "black", size=0.2) +
geom_text(x = 9, y = 20, label = "*", colour = "black")+
geom_segment(aes(x = 10, y = 2.5, xend = 10, yend = 9.5),colour = "black", size=0.2) +
geom_text(x = 10, y = 20, label = "*", colour = "black")+
geom_segment(aes(x = 11, y = 2.5, xend = 11, yend = 12),colour = "black", size=0.2) +
geom_text(x = 11, y = 22, label = "***", colour = "black")+
geom_segment(aes(x = 12, y = 2.5, xend = 12, yend = 13),colour = "black", size=0.2) +