######################
#Distribution plots to look at your data
######################

# Read the results file into a data frame called d.
# The path is relative, so the CSV must be in the current working directory.
d <- read.csv(file = "results.no.outliers.csv")

#load some packages
library(ggplot2)
library(dplyr)


# Factor levels have an order. By default R picks a convenience order
# (something like alphabetical); if you want a different order you have to
# specify it. The order matters for plotting and for statistical analyses
# (if you use contrast coding... more on this later).
#
# factor() lets us set both the order and the display names in one call:
#   levels = the values as they appear in the data, in the order we want
#   labels = the names to use for those levels, given in the same order
# The names matter mostly for plotting, because R uses the level names in the
# plot. (You can also rename afterwards with levels(x) = c(...), but then you
# must supply a name for every level; renaming just one level needs a
# different syntax.)
d$embeddedStructure <- factor(
  d$embeddedStructure,
  levels = c("non", "isl"),
  labels = c("non-island", "island")
)

d$dependencyLength <- factor(
  d$dependencyLength,
  levels = c("sh", "lg"),
  labels = c("short", "long")
)

# For condition we only fix the order; the level names stay as they are.
d$condition <- factor(
  d$condition,
  levels = c("wh.non.sh", "wh.non.lg", "wh.isl.sh", "wh.isl.lg")
)


# First, drop the fillers, because we aren't interested in them anymore:
# keep only the rows whose island column equals "wh".
# Base R's subset() could do this too, but dplyr's filter() is even easier.
# Writing dplyr::filter makes it explicit that we mean the dplyr function.
d2 <- dplyr::filter(d, island == "wh")


# Next, average each condition within each subject (each subject saw two
# tokens per condition), so that we end up with one mean z-score per subject
# per condition.
# Base R's aggregate() could do this, but dplyr makes it easier.
# This also introduces chaining: the %>% operator passes the result of one
# step on as the input of the next. Name the dataset first, then list the
# steps to apply to it.
subject.means <- d2 %>%
  # island, embeddedStructure and dependencyLength are listed as grouping
  # variables so that they are carried through into the summary table.
  group_by(subject, condition, island, embeddedStructure, dependencyLength) %>%
  # na.rm = TRUE ignores missing z-scores when computing the mean.
  summarize(zscores = mean(zscores, na.rm = TRUE)) %>%
  # summarize() only removes the last grouping variable, so the result would
  # otherwise still be grouped; ungroup() returns a plain table so that later
  # operations on subject.means are not silently applied group by group.
  ungroup()


# These graphs use faceting to look nicer. Faceting is a way of arranging
# multiple plots based on the factors in your experimental design: the rows of
# the arrangement are the levels of one factor, and the columns are the levels
# of another. Once you are comfortable with ggplot, learn how to facet; it is
# very useful for complex experimental designs.

# Histogram arranged by factor/level:
# rows = embeddedStructure, columns = dependencyLength.
h <- ggplot(subject.means, aes(x = zscores)) +
  geom_histogram(binwidth = 0.25, color = "black", fill = "white") +
  facet_grid(embeddedStructure ~ dependencyLength) +
  # A plain histogram plots counts per bin, so the axis is labelled as a
  # frequency rather than a density.
  ylab("frequency") +
  xlab("zscores") +
  # Nudge the axis titles slightly away from the axes.
  theme(
    axis.title.x = element_text(vjust = -0.5),
    axis.title.y = element_text(vjust = 1),
    legend.position = "bottom"
  ) +
  # "none" is the supported way to switch a legend off; FALSE is deprecated.
  guides(fill = "none")
  # Optional: append + theme_minimal() for a lighter look.


# Density plot arranged by factor/level:
# rows = embeddedStructure, columns = dependencyLength.
# NOTE(review): this assignment reuses the name d, so from here on d is the
# plot and no longer the data frame read from the CSV. Nothing below needs the
# data frame, but re-running earlier lines interactively will fail until the
# data are read in again. Consider renaming this plot.
d <- ggplot(subject.means, aes(x = zscores)) +
  geom_density() +
  facet_grid(embeddedStructure ~ dependencyLength) +
  ylab("density") +
  xlab("zscores") +
  # Nudge the axis titles slightly away from the axes.
  theme(
    axis.title.x = element_text(vjust = -0.5),
    axis.title.y = element_text(vjust = 1),
    legend.position = "bottom"
  ) +
  # "none" is the supported way to switch a legend off; FALSE is deprecated.
  guides(fill = "none")
  # Optional: append + theme_minimal() for a lighter look.

# Density + histogram drawn on the same axis, where the y-axis is frequency.
# This flattens the density curve, because the frequency scale is usually much
# larger than the density scale.
b.frequency <- ggplot(subject.means, aes(x = zscores)) +
  geom_histogram(binwidth = 0.25, color = "black", fill = "white") +
  geom_density() +
  facet_grid(embeddedStructure ~ dependencyLength) +
  # The histogram sets the scale here (counts per bin), so label it as such.
  ylab("frequency") +
  xlab("zscores") +
  # Nudge the axis titles slightly away from the axes.
  theme(
    axis.title.x = element_text(vjust = -0.5),
    axis.title.y = element_text(vjust = 1),
    legend.position = "bottom"
  ) +
  # "none" is the supported way to switch a legend off; FALSE is deprecated.
  guides(fill = "none")
  # Optional: append + theme_minimal() for a lighter look.

# Density + histogram, where the y-axis is density, so the bars and the curve
# are on the same scale.
b.density <- ggplot(subject.means, aes(x = zscores)) +
  # after_stat(density) asks the histogram for its computed density instead of
  # the raw count. It replaces the deprecated ..density.. notation and needs
  # ggplot2 >= 3.3.0.
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 0.25, color = "black", fill = "white"
  ) +
  geom_density() +
  facet_grid(embeddedStructure ~ dependencyLength) +
  ylab("density") +
  xlab("zscores") +
  # Nudge the axis titles slightly away from the axes.
  theme(
    axis.title.x = element_text(vjust = -0.5),
    axis.title.y = element_text(vjust = 1),
    legend.position = "bottom"
  ) +
  # "none" is the supported way to switch a legend off; FALSE is deprecated.
  guides(fill = "none")
  # Optional: append + theme_minimal() for a lighter look.

# Same density + histogram plot, but with the conditions stacked in a single
# column (one row per condition) so you can see the differences in the
# distributions.
b.vertical <- ggplot(subject.means, aes(x = zscores)) +
  # after_stat(density) asks the histogram for its computed density instead of
  # the raw count. It replaces the deprecated ..density.. notation and needs
  # ggplot2 >= 3.3.0.
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 0.25, color = "black", fill = "white"
  ) +
  geom_density() +
  # condition ~ . means: one row per level of condition, no column factor.
  facet_grid(condition ~ .) +
  ylab("density") +
  xlab("zscores") +
  # Nudge the axis titles slightly away from the axes.
  theme(
    axis.title.x = element_text(vjust = -0.5),
    axis.title.y = element_text(vjust = 1),
    legend.position = "bottom"
  ) +
  # "none" is the supported way to switch a legend off; FALSE is deprecated.
  guides(fill = "none")
  # Optional: append + theme_minimal() for a lighter look.


# If you want to specify the size of the plot (e.g. for publication), you do
# that by specifying the size of the window you are going to open.
# dev.new() opens a new graphics device of the default type for your system,
# so the same line can be used on every platform (quartz() only exists on a
# Mac; windows() only on Windows). Width and height are in inches.
# noRStudioGD = TRUE asks for a separate window instead of the RStudio pane.
# Pro tip: Plots look best if they are in a golden ratio. You can roughly get
# this by making the window a golden ratio (e.g., 10 x 6.18); however, there
# is stuff on the borders of graphs, so this won't be perfect.
dev.new(width = 10, height = 6.18, noRStudioGD = TRUE)

# print() draws the plot even when this script is run with source(), where
# just typing the object name would not display anything.
print(b.vertical)

# You can save a ggplot to pdf using ggsave(). If you do not say which plot,
# ggsave() saves the most recent ggplot (not whatever is in the active
# window), and if you do not give a size it uses the size of the current
# graphics device. Passing the plot and the size explicitly avoids surprises.
# If you want to save from the active window instead, you can use the pdf()
# command.
ggsave("distribution.plot.pdf", plot = b.vertical, width = 10, height = 6.18)