##################################################################
### --- R script "Acquisition of Intensifiers. Part 1"
### --- R-Version: R version 3.0.1 (2013-05-16) -- "Good Sport"
### --- NOTE
### --- CONTACT
### --- martin.schweinberger.hh@gmail.com
### --- If you have questions,suggestions or you found errors
### --- or in case you would to provide feedback, questions
### --- write an email to
### --- CITATION
### --- If you use this script or results thereof, please cite it as:
### --- Schweinberger, Martin. 2015. "The acquisition of intensifiers",
### --- unpublished R script, Hamburg University.
###############################################################
###                   START
###############################################################
# Remove all lists from the current workspace
rm(list=ls(all=T))
# Install packages
library(tm)
library(stringr)
library(plyr)
library(car)
library(RLRsim)
library(car)
library(QuantPsyc)
library(boot)
library(plyr)
library(rms)
library(tm)
library(plyr)
library(reshape)
library(zoo)
library(Hmisc)
library(gsubfn)
library(reshape)
source("C:\\R/ConcR_2.3.r")
source("C:\\R/POStagObject_02.R") # for pos-tagging objects
###############################################################
# Setting options
options(stringsAsFactors = F)
# define image directors
imageDirectory<-"C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images"
# Specify pathnames of the corpra
corpus <- "C:\\Corpora\\original\\CHILDESunzipped\\Eng-NA-MOR\\HSLLD"
bio.path <- "C:\\03-MyProjects\\07IntensifierAcquisition/biowc.txt"
#int.path <- "C:\\03-MyProjects\\07IntensifierAcquisition/int.txt"
###############################################################
###                   START
###############################################################
# Prepare for loading corpus
# Choose the files
cha = list.files(path = corpus, pattern = ".cha$", all.files = T,
  full.names = T, recursive = T, ignore.case = T, include.dirs = T)
###############################################################
# extract biodata
#cha <- cha[1:5]
bio <- sapply(cha, function(x) {
  file <- gsub("C:\\Corpora\\original\\CHILDESunzipped\\Eng-NA-MOR\\HSLLD/", "", x, fixed = T)
  file <- gsub(".cha", "", file, fixed = T)
  x <- scan(x, what = "char", sep = "\t", quiet = T, quote = "", skipNul = T)
  x <- gsub("\n", " ", x, fixed = T)
  x <- gsub("\t", " ", x, fixed = T)
  x <- str_trim(x, side = "both")
  x <- gsub(" {2,}", " ", x)
  x <- paste(x, collapse = " ", sep = " ")
  x <- str_replace_all(x, "\\*.*", "")
  rps <- str_count(x, "@ID")
  # name
  nam <- str_replace_all(file, ".*/", "")
  # id
  pid <- str_replace_all(x, ".*@PID", "@PID")
  # pid
  pid <- str_replace_all(x, ".*@PID: ", "")
  pid <- str_replace_all(pid, "@.*", "")
  pid <- str_trim(pid, side = "both")
  pid <- gsub(" {2,}", " ", pid)
  # languages
  lan <- str_replace_all(x, ".*@Languages: ", "")
  lan <- str_replace_all(lan, "@.*", "")
  lan <- str_trim(lan, side = "both")
  lan <- gsub(" {2,}", " ", lan)
  # participants
  par <- str_replace_all(x, ".*@Participants: ", "")
  par <- str_replace_all(par, "@.*", "")
  par <- str_trim(par, side = "both")
  par <- gsub(" {2,}", " ", par)
  # birth of child
  bir <- str_replace_all(x, ".*@Birth of CHI: ", "")
  bir <- str_replace_all(bir, "@.*", "")
  bir <- str_trim(bir, side = "both")
  bir <- gsub(" {2,}", " ", bir)
  # comment
  com <- str_replace_all(x, ".*@Comment: ", "")
  com <- str_replace_all(com, "@.*", "")
  com <- str_trim(com, side = "both")
  com <- gsub(" {2,}", " ", com)
  # date
  dat <- str_replace_all(x, ".*@Date: ", "")
  dat <- str_replace_all(dat, "@.*", "")
  dat <- str_trim(dat, side = "both")
  dat <- gsub(" {2,}", " ", dat)
  # location
  loc <- str_replace_all(x, ".*@Location: ", "")
  loc <- str_replace_all(loc, "@.*", "")
  loc <- str_trim(loc, side = "both")
  loc <- gsub(" {2,}", " ", loc)
  # situation
  sit <- str_replace_all(x, ".*@Situation: ", "")
  sit <- str_replace_all(sit, "@.*", "")
  sit <- str_trim(sit, side = "both")
  sit <- gsub(" {2,}", " ", sit)
  # activities
  act <- str_replace_all(x, ".*@Activities: ", "")
  act <- str_trim(act, side = "both")
  act <- gsub(" {2,}", " ", act)
  # ids
  ids <- as.vector(unlist(strsplit(gsub("(@ID)", "~~~\\1", x), "~~~")))
  ids <- as.vector(unlist(ids[2:length(ids)]))
  ids <- str_replace_all(ids, "@ID: ", "")
  ids <- str_replace_all(ids, "@.*", "")
  ids <- str_replace_all(ids, "\\|{2,}", "|")
  ids <- str_replace_all(ids, "eng\\|HSLLD\\|", "")
  ids <- gsub("\\|$", "", ids, perl = T)
  # agechild
  age <- str_replace_all(ids, "CHI\\|", "")
  age <- str_replace_all(age, "\\|.*", "")
  age <- str_replace_all(age, "[A-Z]{2,}[0-9]{0,2}", "NA")
  # genderchild
  sex <- str_replace_all(ids, ".*female", "female")
  sex <- str_replace_all(sex, ".*\\|male", "male")
  sex <- str_replace_all(sex, "male.*", "male")
  sex<- as.vector(unlist(sapply(sex, function(x){
    x <- ifelse(nchar(x) > 8, "NA", x)})))
  # ids 2
  ids <- str_replace_all(ids, "[0-9]{1,2};[0-9]{1,2}.{0,1}[0-9]{0,2}\\|{0,1}[a-z]{0,6}\\|", "")
  ids <- str_trim(ids, side = "both")
  ids <- str_replace_all(ids, "\\|$", "")
  # spk
  spk <- str_replace_all(ids, "\\|.*", "")
  # create a table of results
  vec <- c(file, pid, nam, lan, par, bir, com, dat, loc, sit, act, x)
  cntnt <- c(rep(vec, rps))
  df <- matrix(cntnt, ncol= 12, byrow = T)
  mt <- cbind(ids, spk, age, sex, df)
  colnames(mt) <- c("id", "spk", "age", "gender", "file", "pid", "name", "language", "participants", "birth", "comment", "date", "location", "situation", "actvities", "all")
  return(mt)
  })
# collapse all tables into 1 table
biotb <- do.call("rbind", bio)
biotb <- as.data.frame(biotb)
# inspect data
head(biotb)

###############################################################
# count words for each speaker
wc <- sapply(cha, function(x) {
  file <- gsub("C:\\Corpora\\original\\CHILDESunzipped\\Eng-NA-MOR\\HSLLD/", "", x, fixed = T)
  file <- gsub(".cha", "", file, fixed = T)
  x <- scan(x, what = "char", sep = "\t", quiet = T, quote = "", skipNul = T)
  x <- gsub("\n", " ", x, fixed = T)
  x <- gsub("\t", " ", x, fixed = T)
  x <- str_trim(x, side = "both")
  x <- gsub(" {2,}", " ", x)
  orig <- paste(x, collapse = " ", sep = " ")
  full <- strsplit(gsub("([*]{1,1}[A-Z]{2,4}[0-9]{0,1}:)", "~~~\\1", orig), "~~~")
  full2 <- unlist(full)
  meta <- full2[1]
  full2 <- full2[2:length(full2)]
  x <- strsplit(gsub("([%|*][a-z|A-Z]{2,4}[0-9]{0,1}:)", "~~~\\1", orig), "~~~")
  txt <- sapply(x, function(y){
    y <- str_replace_all(y, "%.*", "")
    y <- str_replace_all(y, "@.*", "")
    y <- str_replace_all(y, "\\.|\\?|;|,|\\[|\\]|!|\\+|\\(|\\)|<|>|[0]", "")
    y <- str_trim(y, side = "both")
    y <- gsub(" {2,}", " ", y)
    y <- y[y != ""] })
  x <- str_split(txt, ":")
  spk <- sapply(x, function(y){
    y <- y[1] })
  file <- as.vector(unlist(rep(file, length(spk))))
  wrds <- sapply(x, function(y){
    y <- y[2]
    y <- str_trim(y, side = "both")
    y <- gsub(" {2,}", " ", y)
    })
  cnt <- sapply(x, function(y){
    y <- y[2]
    y <- str_trim(y, side = "both")
    y <- gsub(" {2,}", " ", y)
    y <- str_split(y, " ")
    y <- sapply(y, function(z){ z <- ifelse(z == "", 0, length(z)) })
    y <- unique(y)
    })
  spk <- gsub("*", "", spk, fixed = T)
  df <- matrix(data = c(file, full2, txt, spk, wrds, cnt, rep(meta, length(txt))), ncol = 7, byrow = F)
  })
# collapse all tables into 1 table
wc <- do.call("rbind", wc)
wc <- as.data.frame(wc)
colnames(wc) <- c("file", "full", "spkutt", "spk", "utt", "wc", "meta")
# inspect data
head(wc)

###############################################################
# save data to disc
write.table(wc, file = "C:\\03-MyProjects\\07IntensifierAcquisition/wc.txt", sep = "\t", row.names = FALSE, col.names = TRUE)
###############################################################
# read data from disc
wc <- read.table("C:\\03-MyProjects\\07IntensifierAcquisition/wc.txt", sep = "\t", header = T, quote = "", comment.char = "")
# clean read data
wc$utt <- gsub("\"", "", wc$utt, fixed = T)
# clean colnames
colnames(wc) <- gsub("X.", "", colnames(wc), fixed = T)
colnames(wc) <- gsub(".", "", colnames(wc), fixed = T)
# clean data
wc$utt <- as.vector(unlist(sapply(wc$utt, function(x){
  x <- gsub("/", "", x, fixed = T)
  x <- gsub("=", "", x, fixed = T)
  x <- gsub("&", "", x, fixed = T)
  x <- gsub("*", "", x, fixed = T)
  x <- gsub("^", "", x, fixed = T)
  x <- gsub("_", "", x, fixed = T)
  x <- gsub("€", "", x, fixed = T)
  x <- gsub("†", "", x, fixed = T)
  x <- gsub("ā", "", x, fixed = T)
  x <- gsub("˛", "", x, fixed = T)
  x <- gsub("‘", "'", x, fixed = T) } )))
# inspect data
head(wc)

###############################################################
# split data into smaller chunks
pos01 <- wc$utt[1:50000]
pos02 <- wc$utt[50001:100000]
pos03 <- wc$utt[100001:150000]
pos04 <- wc$utt[150001:200000]
pos05 <- wc$utt[200001:250000]
pos06 <- wc$utt[250001:300000]
pos07 <- wc$utt[300001:nrow(wc)]
# pos tagging data
hslldpos01 <- POStag(object = pos01)
hslldpos01 <- as.vector(unlist(hslldpos01))
writeLines(hslldpos01, con = "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos01.txt", sep = "\n", useBytes = FALSE)
###
hslldpos02 <- POStag(object = pos02)
hslldpos02 <- as.vector(unlist(hslldpos02))
writeLines(hslldpos02, con = "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos02.txt", sep = "\n", useBytes = FALSE)
###
hslldpos03 <- POStag(object = pos03)
hslldpos03 <- as.vector(unlist(hslldpos03))
writeLines(hslldpos03, con = "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos03.txt", sep = "\n", useBytes = FALSE)
###
hslldpos04 <- POStag(object = pos04)
hslldpos04 <- as.vector(unlist(hslldpos04))
writeLines(hslldpos04, con = "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos04.txt", sep = "\n", useBytes = FALSE)
###
hslldpos05 <- POStag(object = pos05)
hslldpos05 <- as.vector(unlist(hslldpos05))
writeLines(hslldpos05, con = "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos05.txt", sep = "\n", useBytes = FALSE)
###
hslldpos06 <- POStag(object = pos06)
hslldpos06 <- as.vector(unlist(hslldpos06))
writeLines(hslldpos06, con = "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos06.txt", sep = "\n", useBytes = FALSE)
###
hslldpos07 <- POStag(object = pos07)
hslldpos07 <- as.vector(unlist(hslldpos07))
writeLines(hslldpos07, con = "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos07.txt", sep = "\n", useBytes = FALSE)
###############################################################
# list pos tagged elements
postag.files = c("C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos01.txt",
  "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos02.txt",
  "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos03.txt",
  "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos04.txt",
  "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos05.txt",
  "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos06.txt",
  "C:\\03-MyProjects\\07IntensifierAcquisition/hslldpos07.txt")
# load pos tagged elements
hslldpos <- as.vector(unlist(sapply(postag.files, function(x) {
  x <- scan(x, what = "char", sep = "\n", quote = "", quiet = T, skipNul = T)
  x <- gsub(" {2,}", " ", x)
  x <- str_trim(x, side = "both")
  x <- str_replace_all(x, fixed("\n"), " ")
  } )))
# unlist pos tagged elements and add to data frame
wc$hslldpos <- hslldpos
# clean data frame
wc4 <- apply(wc, 2, function(x){
  x <- str_replace_all(x, fixed("\""), "") } )
# convert into data frame
df5 <- as.data.frame(wc4)
###############################################################
# save data before pos tagging
write.table(df5, "C:\\03-MyProjects\\07IntensifierAcquisition/hslld_postagged.txt", sep = "\t", row.names = F, col.names = T)
# read data for pos tagging
df5 <- read.table("C:\\03-MyProjects\\07IntensifierAcquisition/hslld_postagged.txt", sep = "\t", header = T, quote = "", comment.char = "")
# clean data frame
df5 <- apply(df5, 2, function(x){
  x <- str_replace_all(x, fixed("\""), "") } )
# convert into data frame
df5 <- as.data.frame(df5)
# add id column
df5$id <- 1:nrow(df5)
# clean colnames
colnames(df5) <- c("fl", "cntnt", "spkutt", "spk", "utt", "wc", "meta", "pos", "id")
# clean postagged utts
df5$pos <- as.vector(unlist(sapply(df5$pos, function(x){
  x <- gsub("/''", "", x, fixed = T)
  x <- gsub("/``", "", x, fixed = T)
  x <- str_trim(x, side = "both")
  x <- gsub(" {2,}", " ", x)} )))
# save data before pos tagging
write.table(df5, "C:\\03-MyProjects\\07IntensifierAcquisition/hslld_postagged_2.txt", sep = "\t", row.names = F, col.names = T)
# read data for pos tagging
df5 <- read.table("C:\\03-MyProjects\\07IntensifierAcquisition/hslld_postagged_2.txt", sep = "\t", header = T, quote = "", comment.char = "")
# clean data frame
df5 <- apply(df5, 2, function(x){
  x <- str_replace_all(x, fixed("\""), "") } )
# convert into data frame
df5 <- as.data.frame(df5)
# clean colnames
colnames(df5) <- c("fl", "cntnt", "spkutt", "spk", "utt", "wc", "meta", "pos", "id")
# attach file subfile speaker id to pos-tagged string
df5$spksupos <- as.vector(unlist(apply(df5, 1, function(x){
  x <- paste(x[4], ": ", x[8], sep = "", collapse = "")
  } )))
# repair file names
df5$fl <- as.vector(unlist(sapply(df5$fl, function(x){
  x <- gsub("/", "-", x, fixed = T) } )))
# split file subfile speaker pos-tagged su by file subfile
hslld.tmp1 <- by(df5$spksupos, df5$fl, paste)
# rename data
hslld <- hslld.tmp1
# save result to disc
lapply(seq_along(hslld), function(i)writeLines(text = unlist(hslld[i]),
    con = paste("C:\\Corpora\\edited\\HSLLD-PosTagged/", names(hslld)[i],".txt", sep = "")))
###############################################################
# tabulate wc
wctb <- data.frame(wc$file, wc$spk, wc$wc)
# clean colnames
colnames(wctb) <- str_replace_all(colnames(wctb), "wc.", "")
# clean content
wctb$file <- gsub("\"", "", wctb$file, fixed = T)
wctb$spk <- gsub("\"", "", wctb$spk, fixed = T)
wctb$wc <- gsub("\"", "", wctb$wc, fixed = T)
# combine file and speaker
wctb$filespk <- as.vector(unlist(apply(wctb, 1, function(x){
  x <- paste(x[1], "$", x[2], collapse = "")
  x <- str_replace_all(x, " ", "") })))
wctb$wc <- gsub("\"", "", wctb$wc, fixed = T)
wctb$wc <- as.numeric(wctb$wc)
wctb2 <- ddply(wctb, "wctb$filespk", transform, wc=sum(wc))
wctb2 <- subset(wctb2, !duplicated(filespk))
# combine bio and wc
biowc <- join(biotb, wctb2, by = c("file", "spk"), type = "left")
# inspect data
head(biowc)

# update gender column
biowc$gender <- with(biowc, ifelse(biowc$spk =="MOT", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="SIS", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="SIS1", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="SIS2", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="SIS3", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="SI1", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="SI2", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="SI3", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="WOM", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="ANT", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="AUN", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="GMA", "female", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="FAT", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="UNC", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="BRO", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="BRO1", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="BRO2", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="BR1", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="BR2", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="BR3", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="BR3", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="BAB", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="GFA", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="GPA", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="MAN", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="MAL", "male", biowc$gender))
biowc$gender <- with(biowc, ifelse(biowc$spk =="MAL2", "male", biowc$gender))
# create a column holding the socio economic status (ses)
biowc$ses <- biowc$comment
biowc$ses <- str_replace_all(biowc$ses, ".*SES", "")
biowc$ses <- str_replace_all(biowc$ses, ".*lower.*", "lower")
biowc$ses <- str_replace_all(biowc$ses, ".*middle.*", "middle")
biowc$ses <- as.vector(unlist(sapply(biowc$ses, function(x){
  x <- ifelse(x == "lower", "lower",
    ifelse(x == "middle", "middle", NA)) })))
# save results to disc
write.table(biowc, file = "C:\\03-MyProjects\\07IntensifierAcquisition/biowc.txt", sep = "\t", row.names = FALSE, col.names = TRUE)
###############################################################
###                    SEARCH
###############################################################
# set parameters
pathname <- "C:\\Corpora\\edited\\HSLLD-PosTagged"
context <- 80
all.pre = T
# define search strings
search.pattern <-  c("[A-Z]{0,1}[a-z]{0,}-{0,1}[a-z]{2,}\\/JJ")
##########################################################
# start searches
int <- ConcR(pathname, search.pattern, context, all.pre = T)
# extract speaker
int$all.pre <- gsub(".*([A-Z]{2,}_{0,1}[0-9]{0,1}:)", "\\1", int$all.pre)
int$all.pre <- str_replace_all(int$all.pre, ":.*", "")
# code status
int$status <- as.vector(unlist(sapply(int$post, function(x){
  x <- str_trim(x, side = "both")
  x <- gsub(" .*", "", x)
  x <- gsub(".*\\/", "", x)
  x <- gsub("NNS", "NN", x)
  x <- ifelse(x == "NN", "attr", "pred")
  } )))
# code previous element (pos)
int$prepos <- as.vector(unlist(sapply(int$pre, function(x){
  x <- str_trim(x, side = "both")
  x <- gsub(".*\\/", "", x)
  x <- gsub(" .*", "", x) } )))
# code previous element (lexeme)
int$prelex <- as.vector(unlist(sapply(int$pre, function(x){
  x <- str_trim(x, side = "both")
  x <- gsub(".* ", "", x)
  x <- gsub("\\/.*", "", x) } )))
# clean file
int$file <- as.vector(unlist(sapply(int$file, function(x){
  x <- gsub(".txt", "", x, fixed = T)
  x <- gsub("-", "/", x, fixed = T) } )))
# extract all attributive adjectives
int <- int[int$status == "attr",]
# code intensifiers
int$absint <- as.vector(unlist(sapply(int$prelex, function(x){
  x <- ifelse(x == "awful", 1,
    ifelse(x == "awfully", 1,
    ifelse(x == "completely", 1,
    ifelse(x == "pretty", 1,
    ifelse(x == "radically", 1,
    ifelse(x == "real", 1,
    ifelse(x == "really", 1,
    ifelse(x == "so", 1,
    ifelse(x == "too", 1,
    ifelse(x == "rtotally", 1,
    ifelse(x == "very", 1,
    ifelse(x == "wonderfully", 1, 0)))))))))))) } )))
# inspect all pos tags preceeding attr adjectives
#table(int$prepos)
# reorganize data frames
# colnames(int)
int <- data.frame(1:nrow(int), int$file, int$file, int$file, int$file, int$all.pre, int$pre, int$token, int$post,
  int$status, int$prepos, int$prelex, int$absint, int$token)
# adapt column names
colnames(int) <- c("id", "file", "visit", "styp", "chi", "spk", "pre", "token", "post", "status", "prepos",
  "prelex", "absint", "tokenlex")
# clean visit
int$visit <- tolower(gsub("\\/.*", "", int$visit))
# clean styp
int$styp <- tolower(gsub("HV[0-9]{1,1}\\/", "", int$styp))
int$styp <- gsub("\\/.*", "", int$styp)
# clean chi
int$chi <- tolower(gsub(".*\\/", "", int$chi))
int$chi <- gsub("[a-z]{2,2}[0-9]{1,1}", "", int$chi)
# clean tokenlex
int$tokenlex <- gsub("\\/.*", "", int$tokenlex)
int$tokenlex <- str_trim(int$tokenlex, side = "both")
# inspect data
head(int)

###############################################################
# save table to disc
write.table(int, file = "C:\\03-MyProjects\\07IntensifierAcquisition/int.txt", sep = "\t", row.names = FALSE, col.names = TRUE)
###############################################################
###                   END of PART 1
###############################################################
##################################################################
### --- R script "Acquisition of Intensifiers"
### --- R-Version: R version 3.0.1 (2013-05-16) -- "Good Sport"
### --- NOTE
### --- CONTACT
### --- martin.schweinberger.hh@gmail.com
### --- If you have questions,suggestions or you found errors
### --- or in case you would to provide feedback, questions
### --- write an email to
### --- CITATION
### --- If you use this script or results thereof, please cite it as:
### --- Schweinberger, Martin. 2015. "The acquisition of intensifiers",
### --- unpublished R script, Hamburg University.
###############################################################
###############################################################
###                   START
###############################################################
# Remove all lists from the current workspace
rm(list=ls(all=T))
# Install packages
#install.packages("stringr")
library(tm)
library(stringr)
library(plyr)
library(car)
library(ggplot2)
library(RLRsim)
library(car)
library(QuantPsyc)
library(boot)
library(nlme)
#detach(package:nlme)
library(lme4)
library(ez)
library(sjPlot)
library(visreg)
library(mlogit)
library(plyr)
library(rms)
library(tm)
library(stringr)
library(gsubfn)
library(plyr)
library(reshape)
library(zoo)
library(cfa)
library(Hmisc)
library(RLRsim)
library(sjPlot)
library(mlogit)
library(effects)
library(lme4)
library(languageR)
library(grDevices)
source("C:\\R/ConcR_2.3.r")
source("C:\\R/mlr.summary.R")
source("C:\\R/blr.summary.R")
source("C:\\R/meblr.summary.R")
source("C:\\R/ModelFittingSummarySWSDGLM.R") # for Mixed Effects Model fitting (step-wise step-up): Binary Logistic Mixed Effects Models
source("C:\\R/ModelFittingSummarySWSUGLM.R") # for Mixed Effects Model fitting (step-wise step-down): Binary Logistic Mixed Effects Models
source("C:\\R/ModelFittingSummarySWSULogReg.R") # for Fixed Effects Model fitting: Binary Logistic Models
###############################################################
# Setting options
options(stringsAsFactors = F)
# define image directors
imageDirectory<-"C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images"
# Specify pathnames of the corpra
corpus <- "C:\\Corpora\\original\\CHILDESunzipped\\Eng-NA-MOR\\HSLLD"
bio.path <- "C:\\03-MyProjects\\07IntensifierAcquisition/biowc.txt"
int.path <- "C:\\03-MyProjects\\07IntensifierAcquisition/int.txt"
###############################################################
### ---               PART TWO
###############################################################
###                   START
###############################################################
# read in data
bio <- read.table(bio.path, sep = "\t", header=TRUE)
int <- read.table(int.path, sep = "\t", header=TRUE)
# extract examples
exp <- int[int$absint == 1, ]
# join data
int.ac <- join(int, bio, by = c("file", "spk"), type = "left")
# inspect data
#head(int.ac)

###############################################################
# restructure data frame
int <- data.frame(int.ac$file, int.ac$visit, int.ac$styp, int.ac$spk,
  int.ac$age, int.ac$gender, int.ac$wc, int.ac$status, int.ac$prepos, int.ac$prelex,
  int.ac$tokenlex, int.ac$absint)
# clean colnames
colnames(int) <- gsub("int.ac.", "", colnames(int))
# clean age
int$age <- gsub(".", "qwertz", int$age, fixed = T)
int$age <- gsub("qwertz.*", "", int$age)
# remove speakers with less than 20 words
int.tmp5 <- int#[int$wc >=20, ]
# recode age
agem <- gsub(".*;", "", int.tmp5$age)
agem <- as.numeric(agem)
agem <- as.vector(unlist(sapply(agem, function(x){
  x <- x/12
  x <- round(x, 3)} )))
agey <- gsub(";.*", "", int.tmp5$age)
agey <- as.numeric(agey)
int.tmp5$ageedit <- agey + agem
# remove all speakers who are neither child or mother
int.tmp5$rmv <- as.vector(unlist(sapply(int.tmp5$spk, function(x){
  x <- ifelse(x == "CHI", "keep",
    ifelse(x == "MOT", "keep", "rmv")) } )))
int.tmp6 <- int.tmp5[int.tmp5$rmv == "keep", ]
# recode age
int.tmp6$agecat <- gsub("\\..*", "", int.tmp6$ageedit)
int.tmp6$agecat <- factor(int.tmp6$agecat, levels = c("3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
# rename data set
data <- int.tmp6
# recode styp
#data <- data[data$styp == "mt",]
#data$styp <- as.vector(unlist(sapply(data$styp, function(x){
#  x <- ifelse(x == "br", "formal",
#    ifelse(x == "tp", "informal",
#    ifelse(x == "mt", "informal",
#    ifelse(x == "er", "formal",
#    ifelse(x == "et", "formal",
#    ifelse(x == "lw", "formal",
#    ifelse(x == "re", "formal",
#    ifelse(x == "md", "formal", x)))))))) } )))
# remove styp MD, sytp er and styp re from data (too low frequency, er=14411, md = 1436, re = 4469)
data <- data[data$styp != "er",]
data <- data[data$styp != "md",]
data <- data[data$styp != "re",]
# remove data points above age 10 (too few instances)
data$remove <- as.vector(unlist(sapply(data$agecat, function(x){
  x <- ifelse(is.na(x) == T, "keep",
  ifelse(x == "11", "rmv",
  ifelse(x == "12", "rmv", "keep"))) } )))
data <- data[which(data$remove == "keep"), ]
data$agecat <- factor(data$agecat, levels = c("3", "4", "5", "6", "7", "8", "9", "10"))
# selcet mt data
datamt <- data[data$styp == "mt", ]
databr <- data[data$styp == "br", ]
datalw <- data[data$styp == "lw", ]
# remove test columns
data <- data[,c(1:13, 15)]
# inspect data
head(data)

# create data of children only
chint <- data[data$spk == "CHI",]
chintbr <- databr[databr$spk == "CHI",]
chintmt <- datamt[datamt$spk == "CHI",]
chintlw <- datalw[datalw$spk == "CHI",]
# create data of mothers only
motint <- data[data$spk == "MOT",]
###############################################################
###############################################################
###############################################################
### ---               VIZUALIZATION
###############################################################
###############################################################
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagepct.png") # save plot
# tabulate data
inttb01 <- table(chint$absint, chint$agecat)
nchi <- ftable(chint$spk, chint$agecat)
nchi <- colSums(nchi)
# calculate percentages of intensified slots
pcnt <- round(inttb01[2,]/(inttb01[1,]+inttb01[2,])*100, 2)
# plot data
barplot(pcnt, col = "grey80", ylim = c(-2, 10), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of child", axes = T, beside = T,
  main = "Children's use of intensifiers against age")
text(seq(0.7, 9.1, 1.2), pcnt + 1, cex = .8, labels = pcnt)
text(seq(0.7, 9.1, 1.2), -1, labels = c(paste("N(slots):\n", nchi[1], sep = ""),
  nchi[2: length(nchi)]), cex = .8)
grid()
box()
dev.off()
###############################################################
###                      MT
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagepctmt02.png") # save plot
# tabulate data
inttb02 <- table(chintmt$absint, chintmt$agecat)
nchi02 <- ftable(chintmt$spk, chintmt$agecat)
nchi02 <- colSums(nchi02)
# calculate percentages of intensified slots
pcnt02 <- round(inttb02[2,]/(inttb02[1,]+inttb02[2,])*100, 2)
# plot data
barplot(pcnt02, col = "grey80", ylim = c(-1, 10), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of child", axes = T, beside = T,
  main = "Children's use of intensifiers\nagainst age in meal time settings")
text(seq(0.7, 9.1, 1.2), pcnt02 + 1, cex = .8, labels = pcnt02)
text(seq(0.7, 9.1, 1.2), -.5, labels = c(paste("N(slots):\n", nchi02[1], sep = ""),
  nchi02[2: length(nchi02)]), cex = .8)
grid()
box()
dev.off()
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagepctmt03.png") # save plot
# tabulate data
inttb03 <- table(chintmt$absint, chintmt$agecat)
inttb03 <- inttb03[, colSums(inttb03) > 100]
nchi03 <- ftable(chintmt$spk, chintmt$agecat)
nchi03 <- colSums(nchi03)
# calculate percentages of intensified slots
pcnt03 <- round(inttb03[2,]/(inttb03[1,]+inttb03[2,])*100, 2)
# plot data
barplot(pcnt03, col = "grey80", ylim = c(-1, 10), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of child", axes = T, beside = T,
  main = "Children's use of intensifiers\nagainst age in meal time settings")
text(seq(0.7, 9.1, 1.2), pcnt03 + 1, cex = .8, labels = pcnt03)
text(seq(0.7, 9.1, 1.2), -.5, labels = c(paste("N(slots):\n", nchi03[1], sep = ""),
  nchi03[2: length(nchi03)]), cex = .8)
grid()
box()
dev.off()
###############################################################
###                      BR
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagepctbr02.png") # save plot
# tabulate data
inttb04 <- table(chintbr$absint, chintbr$agecat)
nchi04 <- ftable(chintbr$spk, chintbr$agecat)
nchi04 <- colSums(nchi04)
# calculate percentages of intensified slots
pcnt04 <- round(inttb04[2,]/(inttb04[1,]+inttb04[2,])*100, 2)
pcnt04 <- pcnt04[is.na(pcnt04) == F]
# plot data
barplot(pcnt04, col = "grey80", ylim = c(-1, 10), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of child", axes = T, beside = T,
  main = "Children's use of intensifiers\nagainst age in book reading settings")
text(seq(0.7, 9.1, 1.2), pcnt04 + 1, cex = .8, labels = pcnt04)
text(seq(0.7, 9.1, 1.2), -.5, labels = c(paste("N(slots):\n", nchi04[1], sep = ""),
  nchi04[2: length(nchi04)]), cex = .8)
grid()
box()
dev.off()
###############################################################
###                      LW
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagepctlw02.png") # save plot
# tabulate data
inttb05 <- table(chintlw$absint, chintlw$agecat)
nchi05 <- ftable(chintlw$spk, chintlw$agecat)
nchi05 <- colSums(nchi05)
# calculate percentages of intensified slots
pcnt05 <- round(inttb05[2,]/(inttb05[1,]+inttb05[2,])*100, 2)
pcnt05 <- pcnt05[is.na(pcnt05) == F]
# plot data
barplot(pcnt05, col = "grey80", ylim = c(-1, 10), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of child", axes = T, beside = T,
  main = "Children's use of intensifiers\nagainst age in letter writing settings")
text(seq(0.7, 9.1, 1.2), pcnt05 + 1, cex = .8, labels = pcnt05)
text(seq(0.7, 9.1, 1.2), -.5, labels = c(paste("N(slots):\n", nchi05[4], sep = ""),
  nchi05[5:length(nchi05)]), cex = .8)
grid()
box()
dev.off()
###############################################################
###                    AGE STYP
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagestyppct.png",
  width = 800, height = 480) # save plot
# tabulate data
inttb06 <- ftable(chint$styp, chint$absint, chint$agecat)
nchi06 <- ftable(chint$styp, chint$spk, chint$agecat)
nchi06 <- colSums(nchi06)
# calculate percentages of intensified slots
pcnt06br <- round(inttb06[2,]/(inttb06[1,]+inttb06[2,])*100, 1)
pcnt06et <- round(inttb06[4,]/(inttb06[3,]+inttb06[4,])*100, 1)
pcnt06lw <- round(inttb06[6,]/(inttb06[5,]+inttb06[6,])*100, 1)
pcnt06mt <- round(inttb06[8,]/(inttb06[7,]+inttb06[8,])*100, 1)
pcnt06tp <- round(inttb06[10,]/(inttb06[9,]+inttb06[10,])*100, 1)
pcnt06 <- rbind(pcnt06br, pcnt06et, pcnt06lw, pcnt06mt, pcnt06tp)
colnames(pcnt06) <- 3:10
# plot data
barplot(pcnt06, col = c("red", "grey 40", "blue", "green", "grey80"),
  ylim = c(-2, 20), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of child", axes = T, beside = T,
  main = "Children's use of intensifiers by situation type")
text(seq(1.5, 43.5, 6), pcnt06[1,] + 1, cex = .8, labels = pcnt06[1,])
text(seq(2.5, 44.5, 6), pcnt06[2,] + 1, cex = .8, labels = pcnt06[2,])
text(seq(3.5, 45.5, 6), pcnt06[3,] + 1, cex = .8, labels = pcnt06[3,])
text(seq(4.5, 46.5, 6), pcnt06[4,] + 1, cex = .8, labels = pcnt06[4,])
text(seq(5.5, 47.5, 6), pcnt06[5,] + 1, cex = .8, labels = pcnt06[5,])
grid()
legend("topleft", inset = .05, c("book reading (br)", "experimental task (et)",
  "letter writing (lw)", "meal time (mt)", "toy play (tp)"),
  horiz = F,  fill = c("red", "grey 40", "blue", "green", "grey80"))
box()
dev.off()
###############################################################
###                    MOT STYP
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intmotstyppct.png",
  width = 480, height = 480) # save plot
# tabulate data
inttb07 <- ftable(motint$styp, motint$absint)
nmot07 <- ftable(motint$styp, motint$spk)
nmot07 <- colSums(nmot07)
# calculate percentages of intensified slots
pcnt07 <- cbind(inttb07, round(inttb07[,2]/(inttb07[,1]+inttb07[,2])*100, 1))
# plot data
barplot(pcnt07[,3], col = c("red", "grey 40", "blue", "green", "grey80"),  ylim = c(0, 12), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of Child", axes = T, beside = T,
  main = "Children's use of intensifiers by situation type")
text(seq(0.7, 9.1, 1.2), pcnt07[,3] + 1, cex = .8, labels = pcnt07[,3])
grid()
legend("topright", inset = .05, c("book reading (br)", "experimental task (et)",
  "letter writing (lw)", "meal time (mt)", "toy play (tp)"),
  horiz = F,  fill = c("red", "grey 40", "blue", "green", "grey80"))
box()
dev.off()
###############################################################
###                     TTR: CHI INT
###############################################################
# plot type-token-ratio of intensifiers against age
chintint <- chint[chint$absint == 1, ]
inttb08 <- table(chintint$prelex, chintint$agecat)
# calculate token freq
tokf08 <- colSums(inttb08)
# calculate type freq
typf08 <- t(apply(inttb08, 1, function(x) x <- ifelse(x > 0, 1, 0)))
typf08 <- colSums(typf08)
# calculate type-token-ratio
chittr08 <- data.frame(3:10, round(typf08/tokf08, 2), typf08, tokf08)
rownames(chittr08) <- 3:10
colnames(chittr08) <- c("Age", "TTR", "Types", "Tokens")
chittr08 <- chittr08[chittr08$Tokens > 1,]
# plot data
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/ttrchiageint.png") # save plot
plot(chittr08[,1:2], xlab = "Age", ylab = "Type-Token-Ratio (TTR): Intensifiers",
  ylim = c(0,1), pch = 19,
  main = "Children's TTRs of intensifiers by age")
abline(lm(chittr08[,2]~chittr08[,1]), col="red") # regression line (y~x)
lws08 <- unlist(lowess(chittr08[,2],chittr08[,1]))
lines(lws08[1:length(lws08)/2], col="blue")
grid()
dev.off()
###############################################################
###                     TTR: CHI ADJ
###############################################################
# plot type-token-ratio of intensifiers against age
chintint <- chint[chint$absint == 1, ]
inttb09 <- table(chintint$tokenlex, chintint$agecat)
# calculate token freq
tokf09 <- colSums(inttb09)
# calculate type freq
typf09 <- t(apply(inttb09, 1, function(x) x <- ifelse(x > 0, 1, 0)))
typf09 <- colSums(typf09)
# calculate type-token-ratio
chittr09 <- data.frame(3:10, round(typf09/tokf09, 2), typf09, tokf09)
rownames(chittr09) <- 3:10
colnames(chittr09) <- c("Age", "TTR", "Types", "Tokens")
chittr09 <- chittr09[chittr09$Tokens > 1,]
# plot data
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/ttrchiageadj.png") # save plot
plot(chittr09[,1:2], xlab = "Age", ylab = "Type-Token-Ratio (TTR): Adjectives",
  ylim = c(0,1), pch = 19,
  main = "Children's TTRs of intensified adjectives by age")
abline(lm(chittr09[,2]~chittr09[,1]), col="red") # regression line (y~x)
lws09 <- unlist(lowess(chittr09[,2],chittr09[,1]))
lines(lws09[1:length(lws09)/2], col="blue")
grid()
dev.off()
###############################################################
###                     MOT INT FREQ
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagepctmot.png") # save plot
ag <- chint[,c(1, 14)]
ag <- unique(ag)
motint10 <- join(motint, ag, by = "file", type = "left", match = "all")
motint10 <- motint10[, c(1:13, 15)]
motint10 <- motint10
# tabulate data
inttb10 <- table(motint10$absint, motint10$agecat)
nmot10 <- ftable(motint10$spk, motint10$agecat)
nmot10 <- colSums(nmot10)
# calculate percentages of intensified slots
pcnt10 <- round(inttb10[2,]/(inttb10[1,]+inttb10[2,])*100, 2)
# plot data
barplot(pcnt10, col = "grey80", ylim = c(-2, 20), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of Child", axes = T, beside = T,
  main = "Mothers' use of intensifiers\nagainst the age of her child")
text(seq(0.7, 9.1, 1.2), pcnt10 + 1, cex = .8, labels = pcnt10)
text(seq(0.7, 9.1, 1.2), -1, labels = c(paste("N(slots):\n", nmot10[1], sep = ""),
  nmot10[2: length(nmot10)]), cex = .8)
tbmotabsint <- table(motint10$absint)
m10 <- round(tbmotabsint[2]/(tbmotabsint[1]+tbmotabsint[2])*100, 2)
lines(0:10, rep(m10, length(0:10)), col="red", lty = 2, lwd = 1)
text(8.2, m10 + 1, cex = .8, paste("mean = ", m10, collapse = ""), col = "red")
grid()
box()
dev.off()
###############################################################
###                     MOT INT MT FREQ
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagemtpctmot.png") # save plot
ag <- chint[,c(1, 14)]
ag <- unique(ag)
motint11 <- join(motint, ag, by = "file", type = "left", match = "all")
motint11 <- motint11[, c(1:13, 15)]
motint11 <- motint11[motint11$styp == "mt", ]
# tabulate data
inttb11 <- table(motint11$absint, motint11$agecat)
nmot11 <- ftable(motint11$spk, motint11$agecat)
nmot11 <- colSums(nmot11)
# calculate percentages of intensified slots
pcnt11 <- round(inttb11[2,]/(inttb11[1,]+inttb11[2,])*100, 2)
# plot data
barplot(pcnt11, col = "grey80", ylim = c(-2, 20), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of Child", axes = T, beside = T,
  main = "Mother's Use of intensifiers during mean time")
text(seq(0.7, 9.1, 1.2), pcnt11 + 1, cex = .8, labels = pcnt11)
text(seq(0.7, 9.1, 1.2), -1, labels = c(paste("N(slots):\n", nmot11[1], sep = ""),
  nmot11[2: length(nmot11)]), cex = .8)
grid()
box()
dev.off()
###############################################################
###                     MOT INT BR FREQ
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagebrpctmot.png") # save plot
ag <- chint[,c(1, 14)]
ag <- unique(ag)
motint12 <- join(motint, ag, by = "file", type = "left", match = "all")
motint12 <- motint12[, c(1:13, 15)]
motint12 <- motint12[motint12$styp == "br", ]
# tabulate data
inttb12 <- table(motint12$absint, motint12$agecat)
nmot12 <- ftable(motint12$spk, motint12$agecat)
nmot12 <- colSums(nmot12)
# calculate percentages of intensified slots
pcnt12 <- round(inttb12[2,]/(inttb12[1,]+inttb12[2,])*100, 2)
pcnt12 <- pcnt12[is.na(pcnt12) == F]
# plot data
x <- barplot(pcnt12, col = "grey80", ylim = c(-2, 20), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of Child", axes = T, beside = T,
  main = "Mother's Use of intensifiers during book reading")
m12 <- round(sum(inttb12[2,])/(sum(inttb12[1,]) + sum(inttb12[2,]))*100, 2)
lines(0:10, rep(m12, length(0:10)), lty = 2, col = "red", lwd = 1)
text(4.3, m12+0.5, paste("mean = ", m12, collapse = ""), cex = .8, col = "red")
text(seq(0.7, 9.1, 1.2), pcnt12 + 1, cex = .8, labels = pcnt12)
text(seq(0.7, 9.1, 1.2), -1, labels = c(paste("N(slots):\n", nmot12[1], sep = ""),
  nmot12[2: length(nmot12)]), cex = .8)
grid()
box()
dev.off()
###############################################################
###                     MOT INT LW FREQ
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagelwpctmot.png") # save plot
ag <- chint[,c(1, 14)]
ag <- unique(ag)
motint13 <- join(motint, ag, by = "file", type = "left", match = "all")
motint13 <- motint13[, c(1:13, 15)]
motint13 <- motint13[motint13$styp == "lw", ]
# tabulate data
inttb13 <- table(motint13$absint, motint13$agecat)
nmot13 <- ftable(motint13$spk, motint13$agecat)
nmot13 <- colSums(nmot13)
# calculate percentages of intensified slots
pcnt13 <- round(inttb13[2,]/(inttb13[1,]+inttb13[2,])*100, 2)
pcnt13 <- pcnt13[is.na(pcnt13) == F]
# plot data
barplot(pcnt13, col = "grey80", ylim = c(-2, 20), ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of Child", axes = T, beside = T,
  main = "Mother's Use of intensifiers during letter writing")
text(seq(0.7, 11.5, 1.2), pcnt13 + 1, cex = .8, labels = pcnt13)
text(seq(0.7, 11.5, 1.2), -1, labels = c(paste("N(slots):\n", nmot13[4], sep = ""),
  nmot13[5:length(nmot13)]), cex = .8)
m13 <- round(sum(inttb13[2,])/(sum(inttb13[1,]) + sum(inttb13[2,]))*100, 2)
lines(0:12, rep(m13, length(0:12)), lty = 2, col = "red", lwd = 1)
text(4.3, m13+0.5, paste("mean = ", m13, collapse = ""), cex = .8, col = "red")
grid()
box()
dev.off()
###############################################################
###
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagestyppctmot.png",
  width = 900, height = 480)
# tabulate data
ag <- chint[,c(1, 14)]
ag <- unique(ag)
motint14 <- join(motint, ag, by = "file", type = "left", match = "all")
motint14 <- motint14[, c(1:13, 15)]
inttb14 <- ftable(motint14$styp, motint14$absint, motint14$agecat)
nmot14 <- ftable(motint14$styp, motint14$spk, motint14$agecat)
nmot14 <- colSums(nmot14)
# calculate percentages of intensified slots
pcnt14br <- round(inttb14[2,]/(inttb14[1,]+inttb14[2,])*100, 1)
pcnt14et <- round(inttb14[4,]/(inttb14[3,]+inttb14[4,])*100, 1)
pcnt14lw <- round(inttb14[6,]/(inttb14[5,]+inttb14[6,])*100, 1)
pcnt14mt <- round(inttb14[8,]/(inttb14[7,]+inttb14[8,])*100, 1)
pcnt14tp <- round(inttb14[10,]/(inttb14[9,]+inttb14[10,])*100, 1)
pcnt14 <- rbind(pcnt14br, pcnt14et, pcnt14lw, pcnt14mt, pcnt14tp)
colnames(pcnt14) <- 3:10
# plot data
x <- barplot(pcnt14, col = c("red", "grey 40", "blue", "green", "grey80"), ylim = c(-2, 30),
  ylab = "Percent (intensified slots of all slots)",
  xlab = "Age of motld", axes = T, beside = T,
  main = "Mother's Use of intensifiers by situation type")
text(seq(1.5, 43.5, 6), pcnt14[1,] + 1, cex = .8, labels = pcnt14[1,])
text(seq(2.5, 44.5, 6), pcnt14[2,] + 1, cex = .8, labels = pcnt14[2,])
text(seq(3.5, 45.5, 6), pcnt14[3,] + 1, cex = .8, labels = pcnt14[3,])
text(seq(4.5, 46.5, 6), pcnt14[4,] + 1, cex = .8, labels = pcnt14[4,])
text(seq(5.5, 47.5, 6), pcnt14[5,] + 1, cex = .8, labels = pcnt14[5,])
legend("topleft", inset = .05, c("book reading (br)", "experimental task (et)",
  "letter writing (lw)", "meal time (mt)", "toy play (tp)"), horiz = F,
  fill = c("red", "grey 40", "blue", "green", "grey80"))
grid()
box()
dev.off()
###############################################################
###                       MOT TTR INT
###############################################################
# plot type-token-ratio of intensifiers against age
motint15 <- motint[motint$absint == 1, ]
# tabulate data
ag <- chint[,c(1, 14)]
ag <- unique(ag)
motint15 <- join(motint15, ag, by = "file", type = "left", match = "all")
motint15 <- motint15[, c(1:13, 15)]
inttb15 <- table(motint15$prelex, motint15$agecat)
# calculate token freq
tokf15 <- colSums(inttb15)
# calculate type freq
typf15 <- t(apply(inttb15, 1, function(x) x <- ifelse(x > 0, 1, 0)))
typf15 <- colSums(typf15)
# calculate type-token-ratio
motttr15 <- data.frame(3:10, round(typf15/tokf15, 2), typf15, tokf15)
motttr15 <- na.omit(motttr15)
colnames(motttr15) <- c("Age", "TTR", "Types", "Tokens")
# plot data
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/ttrmotageint.png") # save plot
plot(motttr15[, 1:2], xlab = "Age of Child", ylab = "Type-Token-Ratio (TTR): Intensifiers",
  ylim = c(0,1), pch = 19,
  main = "Mother's TTRs of intensifiers\nby age of child")
abline(lm(motttr15[,2]~motttr15[,1]), col="red") # regression line (y~x)
lws15 <- unlist(lowess(motttr15[,2],motttr15[,1]))
lines(lws15[1:length(lws15)/2], col="blue")
grid()
dev.off()
###############################################################
###                       MOT TTR ADJ
###############################################################
motadj16 <- motint[motint$absint == 1, ]
# tabulate data
ag <- chint[,c(1, 14)]
ag <- unique(ag)
motadj16 <- join(motadj16, ag, by = "file", type = "left", match = "all")
motadj16 <- motadj16[, c(1:13, 15)]
adjtb16 <- table(motadj16$tokenlex, motadj16$agecat)
# calculate token freq
tokf16 <- colSums(adjtb16)
# calculate type freq
typf16 <- t(apply(adjtb16, 1, function(x) x <- ifelse(x > 0, 1, 0)))
typf16 <- colSums(typf16)
# calculate type-token-ratio
motttr16 <- data.frame(3:10, round(typf16/tokf16, 2), typf16, tokf16)
motttr16 <- na.omit(motttr16)
colnames(motttr16) <- c("Age", "TTR", "Types", "Tokens")
# plot data
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/ttrmotageadj.png") # save plot
plot(motttr16[, 1:2], xlab = "Age of Child", ylab = "Type-Token-Ratio (TTR): Intensifiers",
  ylim = c(0,1),pch = 19,
  main = "Mother's Type-Token-Ratios\nof intensified adjectives against the age of her child")
abline(lm(motttr16[,2]~motttr16[,1]), col="red") # regression line (y~x)
lws16 <- unlist(lowess(motttr16[,2],motttr16[,1]))
lines(lws16[1:length(lws16)/2], col="blue")
grid()
dev.off()
###############################################################
###############################################################
###############################################################
###                         SBC
###############################################################
###############################################################
###############################################################
# read in data
biosbc <- read.table("C:\\MeineHomepage\\docs\\data/BiodataSbcae.txt", sep = "\t", header=TRUE)
# set parameters
context <- 80
all.pre = T
# define search strings
search.pattern <-  c("[A-Z]{0,1}[a-z]{0,}-{0,1}[a-z]{2,}\\/JJ")
##########################################################
# start searches
sbcint <- ConcR("C:\\Corpora\\edited\\SBCPosTagged", search.pattern, context, all.pre = T)
# retrieve speaker
sbcint$all.pre <- gsub("([A-Z]{2,20}[0-9{0,1}]:)", "qwertz\\1", sbcint$all.pre)
sbcint$all.pre <- gsub(".*qwertz", "", sbcint$all.pre)
sbcint$all.pre <- gsub(":.*", "", sbcint$all.pre)
sbcint$all.pre <- tolower(sbcint$all.pre)
# clean column names
colnames(sbcint)[5] <- "spk"
colnames(biosbc)[3] <-  "file"
colnames(biosbc)[4] <-  "spk"
# clean file name
sbcint$file <- gsub(".txt", "", sbcint$file, fixed = T)
sbcint$file <- tolower(sbcint$file)
# join data sets
sbc <- join(sbcint, biosbc, by = c("file", "spk"), type = "left")
# select only female speakers
sbc <- sbc[sbc$gender == "female",]
# code status
sbc$status <- as.vector(unlist(sapply(sbc$post, function(x){
  x <- str_trim(x, side = "both")
  x <- gsub(" .*", "", x)
  x <- gsub(".*\\/", "", x)
  x <- gsub("NNS", "NN", x)
  x <- ifelse(x == "NN", "attr", "pred") } )))
# code previous element (pos)
sbc$prepos <- as.vector(unlist(sapply(sbc$pre, function(x){
  x <- str_trim(x, side = "both")
  x <- gsub(".*\\/", "", x)
  x <- gsub(" .*", "", x) } )))
# code previous element (lexeme)
sbc$prelex <- as.vector(unlist(sapply(sbc$pre, function(x){
  x <- str_trim(x, side = "both")
  x <- gsub(".* ", "", x)
  x <- gsub("\\/.*", "", x) } )))
# extract all attributive adjectives
sbc <- sbc[sbc$status == "attr",]
# code intensifiers
sbc$prelex <- tolower(sbc$prelex)
sbc$absint <- as.vector(unlist(sapply(sbc$prelex, function(x){
  x <- ifelse(x == "actually", 1,
    ifelse(x == "crass", 1,
    ifelse(x == "damn", 1,
    ifelse(x == "evidently", 1,
    ifelse(x == "particularly", 1,
    ifelse(x == "pretty", 1,
    ifelse(x == "real", 1,
    ifelse(x == "really", 1,
    ifelse(x == "right", 1,
    ifelse(x == "so", 1,
    ifelse(x == "too", 1,
    ifelse(x == "very", 1, 0)))))))))))) } )))
# select only women between 20 and 45
sbc <- sbc[sbc$age.exact >= 20, ]
sbc <- sbc[sbc$age.exact <= 45, ]
# inspect data
head(sbc)

##########################################################
###                     FREQ INT ALL
##########################################################
# plot frequency of intensification by child, mothers, and female sbc speakers
# tabulate data
cmp <- cbind(table(chint$absint), table(motint$absint), table(sbc$absint))
cmp <- rbind(cmp, round(cmp[2,]/(cmp[1,]+cmp[2,])*100,2))
colnames(cmp) <- c("Child", "Mother (CDS)", "Female (SBC)")
# save plot
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/cmp-all.png") # save plot
# plot data
barplot(cmp[3,], col = c("grey 40", "grey 60", "grey80"), ylim = c(0, 8),
  ylab = "Percent (intensified slots of all slots)",
  xlab = "", axes = T, beside = T,
  main = "Intensifier use across corpora\nin informal settings")
text(seq(0.7, 3.1, 1.2), cmp[3,] + .5, cex = .8, labels = cmp[3,])
legend("topleft", inset = .05, c("Children", "Mothers (CDS)", "Female SBC speakers"),
  horiz = F,  fill = c("grey 40", "grey 60", "grey80"))
grid()
box()
dev.off()
##########################################################
###                     FREQ INT MT
##########################################################
# plot frequency of intensification by child, mothers, and female sbc speakers
# tabulate data
chintmt <- chint[chint$styp == "mt",]
motintmt <- motint[motint$styp == "mt", ]
cmp <- cbind(table(chintmt$absint), table(motintmt$absint), table(sbc$absint))
cmp <- rbind(cmp, round(cmp[2,]/(cmp[1,]+cmp[2,])*100,2))
colnames(cmp) <- c("Child", "Mother (CDS)", "Female (SBC)")
# save plot
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/cmp-mt.png") # save plot
# plot data
barplot(cmp[3,], col = c("grey 40", "grey 60", "grey80"), ylim = c(0, 8),
  ylab = "Percent (intensified slots of all slots)",
  xlab = "", axes = T, beside = T,
  main = "Intensifier use across corpora\nin informal settings")
text(seq(0.7, 3.1, 1.2), cmp[3,] + .5, cex = .8, labels = cmp[3,])
legend("topleft", inset = .05, c("Children", "Mothers (CDS)", "Female SBC speakers"),
  horiz = F,  fill = c("grey 40", "grey 60", "grey80"))
grid()
box()
dev.off()
###############################################################
###############################################################
###############################################################
### --- tabulate data
###############################################################
###############################################################
###############################################################
# create data for tabularizaion
wcmot <- by(motint$wc, list(motint$file), unique)
motwc <- as.vector(unlist(wcmot))
motfile <- as.vector(unlist(names(wcmot)))
mdat <- data.frame(1:length(motfile), motfile, motwc)
motadj <- as.vector(unlist(table(motint$file)))
motabsint <- as.vector(unlist(by(motint$absint, list(motint$file), sum)))
motrat <- round(motabsint/motadj, 2)
###
motint$awful <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "awful", 1, 0) } )))
motint$awfully <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "awfully", 1, 0) } )))
motint$pretty <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "pretty", 1, 0) } )))
motint$real <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "real", 1, 0) } )))
motint$really <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "really", 1, 0) } )))
motint$so <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "so", 1, 0) } )))
motint$too <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "too", 1, 0) } )))
motint$very <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "very", 1, 0) } )))
motint$wonderfully <- as.vector(unlist(sapply(motint$prelex, function(x){ x <- ifelse(x == "wonderfully", 1, 0) } )))
###
motawful <- as.vector(unlist(by(motint$awful, list(motint$file), sum)))
motawfully <- as.vector(unlist(by(motint$awfully, list(motint$file), sum)))
motpretty <- as.vector(unlist(by(motint$pretty, list(motint$file), sum)))
motreal <- as.vector(unlist(by(motint$real, list(motint$file), sum)))
motreally <- as.vector(unlist(by(motint$really, list(motint$file), sum)))
motso <- as.vector(unlist(by(motint$so, list(motint$file), sum)))
mottoo <- as.vector(unlist(by(motint$too, list(motint$file), sum)))
motvery <- as.vector(unlist(by(motint$very, list(motint$file), sum)))
motwonderfully <- as.vector(unlist(by(motint$wonderfully, list(motint$file), sum)))
# creat data frame
mdat <- data.frame(1:length(motfile), motfile, motwc, motabsint, motadj, motrat, motawful,
  motawfully, motpretty, motreal, motreally, motso, mottoo, motvery, motwonderfully)
# clean column names
colnames(mdat)[1:2] <- c("id", "file")
# inspect data
head(mdat)

###############################################################
# join child and mother data
data2 <- join(chint, mdat, by = "file", type = "left", match = "all")
# add child column
data2$chi <- gsub(".*\\/", "", data2$file)
data2$chi <- gsub("[0-9]", "", data2$chi)
data2$chi <- gsub("[a-z]{2,2}$", "", data2$chi)
# inspect data
head(data2)

###############################################################
# extract file based data
filechiwcagesex <- data.frame(data2$file, data2$wc, data2$agecat, data2$gender)
chiwcagesex <- unique(filechiwcagesex)
colnames(chiwcagesex) <- c("file", "chiwc", "age", "sex")
###
filemotwc <- data.frame(data2$file, data2$motwc, data2$motabsint)
motwc <- unique(filemotwc)
colnames(motwc) <- c("file", "motwc", "motabsint")
###
absint <- as.vector(unlist(by(data2$absint, list(data2$file), sum)))
fileabsint <- names(by(data2$absint, list(data2$file), sum))
fileabsint <- data.frame(fileabsint, absint)
colnames(fileabsint) <- c("file", "absint")
# combine information
data3 <- join(chiwcagesex, motwc, by = "file", type = "left", match = "all")
data4 <- join(data3, fileabsint, by = "file", type = "left", match = "all")
# add child column
data4$chi <- gsub(".*\\/", "", data4$file)
data4$chi <- gsub("[0-9]", "", data4$chi)
data4$chi <- gsub("[a-z]{2,2}$", "", data4$chi)
# add styp column
data4$styp <- gsub("HV[0-9]\\/", "", data4$file)
data4$styp <- gsub("\\/.*", "", data4$styp)
data4$styp <- tolower(data4$styp)
# remove rows with incomplete information
data4 <- na.omit(data4)
# inspect data
head(data4)

write.table(data4, "C:\\03-MyProjects\\07IntensifierAcquisition/data4.txt", row.names = F, sep="\t")
###############################################################
# create agecat based data set
info <- by(data4$chi, list(data4$styp, data4$sex, data4$age), length)
nspk <-  as.vector(unlist(by(data4$chi, list(data4$styp, data4$sex, data4$age), length)))
chiwc <- as.vector(unlist(by(data4$chiwc, list(data4$styp, data4$sex, data4$age), sum)))
absint <- as.vector(unlist(by(data4$absint, list(data4$styp, data4$sex, data4$age), sum)))
motabsint <- as.vector(unlist(by(data4$motabsint, list(data4$styp, data4$sex, data4$age), sum)))
motwc <-  as.vector(unlist(by(data4$motwc, list(data4$styp, data4$sex, data4$age), sum)))
chiptw <- round(absint/chiwc*1000, 2)
motptw <- round(motabsint/motwc*1000, 2)
###
styp <- rep(c("br", "et", "lw", "mt", "tp"), 16)
sex <- rep(c(rep("female", 5), rep("male", 5)), 8)
age <- c(rep(3, 10), rep(4, 10), rep(5, 10),rep(6, 10),rep(7, 10),rep(8, 10),rep(9, 10), rep(10, 10))
#combine data to table
df1 <- data.frame(age, sex, styp, nspk, absint, chiwc, chiptw, motabsint, motwc, motptw)
# remove incolplete rows
df1 <- na.omit(df1)
# inspect results
head(df1)

write.table(df1, "C:\\03-MyProjects\\07IntensifierAcquisition/df1.txt", row.names = F, sep="\t")
###############################################################
# create agecat based data set
info2 <- by(data4$chi, list(data4$sex, data4$age), length)
nspk2 <-  by(data4$chi, list(data4$sex, data4$age), table)
nspk2 <- sapply(nspk2, function(x){ x <- length(x) } )
chiwc2 <- as.vector(unlist(by(data4$chiwc, list(data4$sex, data4$age), sum)))
absint2 <- as.vector(unlist(by(data4$absint, list(data4$sex, data4$age), sum)))
motabsint2 <- as.vector(unlist(by(data4$motabsint, list(data4$sex, data4$age), sum)))
motwc2 <-  as.vector(unlist(by(data4$motwc, list(data4$sex, data4$age), sum)))
chiptw2 <- round(absint2/chiwc2*1000, 2)
motptw2 <- round(motabsint2/motwc2*1000, 2)
###
sex2 <- rep(c("female", "male"), 8)
age2 <- c(rep(3, 2), rep(4, 2), rep(5, 2),rep(6, 2),rep(7, 2),rep(8, 2),rep(9, 2), rep(10, 2))
#combine data to table
df2 <- data.frame(age2, sex2, nspk2, absint2, chiwc2, chiptw2, motabsint2, motwc2, motptw2)
# remove incolplete rows
sumspk2 <- as.vector(unlist(length(table(data4$chi))))
sumabsint2 <- as.vector(unlist(sum(data4$absint)))
sumchiwc2 <- sum(chiwc2)
sumchiptw2 <- round(sumabsint2/sumchiwc2*1000, 2)
summotabsint2 <- sum(motabsint2)
summotwc2 <- sum(motwc2)
summotptw2 <- round(summotabsint2/summotwc2*1000, 2)
total <- c("", "", sumspk2, sumabsint2, sumchiwc2, sumchiptw2, summotabsint2, summotwc2, summotptw2)
# add to table
df2 <- rbind(df2, total)
# inspect results
head(df2)

write.table(df2, "C:\\03-MyProjects\\07IntensifierAcquisition/df2.txt", row.names = F, sep="\t")
###############################################################
###############################################################
###############################################################
# create agecat based data set
nspk0 <-  length(names(table(data4$chi)))
chiwc0 <- as.vector(unlist(sum(data4$chiwc)))
absint0 <- as.vector(unlist(sum(data4$absint)))
motabsint0 <- as.vector(unlist(sum(data4$motabsint)))
motwc0 <-  as.vector(unlist(sum(data4$motwc)))
chiptw0 <- round(absint0/chiwc0*1000, 2)
motptw0 <- round(motabsint0/motwc0*1000, 2)
#combine data to table
df0 <- cbind(nspk0, absint0, chiwc0, chiptw0)
df0 <- rbind(df0, c("", motabsint0, motwc0, motptw0))
# add row and column names
colnames(df0) <- c("Speakers (N)", "Intensifiers (N)", "Words (N)", "Rel. Freq. (ptw)")
rownames(df0) <- c("Children", "Mothers")
# inspect results
head(df0)

write.table(df0, "C:\\03-MyProjects\\07IntensifierAcquisition/df0.txt", row.names = F, sep="\t")
###############################################################
# add ptw to data4
data4$intptw <- round(data4$absint/data4$chiwc*1000, 2)
# plot genders (scatterplot)
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagegenderptw.png") # save plot
scatterplot(intptw ~ as.numeric(age)|sex, data=data4, reg.line = FALSE, xlab = "Age",
  ylab = "Rel. Frequency", axes = F)
axis(1, at= 1:8, labels = 3:10)
axis(2, at = seq(0, 25, 5), labels = seq(0, 25, 5))
grid()
dev.off()
# plot styp (scatterplot)
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intagestypptw.png") # save plot
scatterplot(intptw ~ as.numeric(age)|styp, data=data4, reg.line = FALSE, xlab = "Age",
  ylab = "Rel. Frequency", axes = F)
axis(1, at= 1:8, labels = 3:10)
axis(2, at = seq(0, 25, 5), labels = seq(0, 25, 5))
grid()
dev.off()
###############################################################
###############################################################
###############################################################
# analogous plot in ggplot2
agemeans <- as.vector(by(data4$intptw, data4$age, mean))
agemeans <- agemeans[complete.cases(agemeans)]
p0 <- ggplot(data4, aes(age, intptw, color = age)) +
  scale_fill_brewer() +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(fun.y = mean, geom = "line") +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
  theme_set(theme_bw(base_size = 20)) +
  coord_cartesian(ylim = c(0, 5)) +
  labs(x = "Age", y = "Rel. Freq. (per 1,000 words)", colour = "age", cex = .75) +
  scale_color_manual(values = c("grey20", "grey25", "grey30", "grey35", "grey40", "grey45", "grey50", "grey55", "grey60")) +
  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[1], 2), sep = ""), x = 1, y = agemeans[1] + 2, colour = "grey20", size = 5) +
  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[2], 2), sep = ""), x = 2, y = agemeans[2] + 2, colour = "grey20", size = 5) +
  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[3], 2), sep = ""), x = 3, y = agemeans[3] + 2, colour = "grey20", size = 5) +
  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[4], 2), sep = ""), x = 4, y = agemeans[4] + 2, colour = "grey20", size = 5) +
  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[5], 2), sep = ""), x = 5, y = agemeans[5] + 2, colour = "grey20", size = 5) +
  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[6], 2), sep = ""), x = 6, y = agemeans[6] + 2, colour = "grey20", size = 5) +
  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[7], 2), sep = ""), x = 7, y = agemeans[7] + 2, colour = "grey20", size = 5) +
  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[8], 2), sep = ""), x = 8, y = agemeans[8] + 2, colour = "grey20", size = 5) +
#  geom_text(mapping = NULL, label = paste("mean=\n", round(agemeans[9], 2), sep = ""), x = 9, y = agemeans[9] + 2, colour = "grey20", size = 5) +
  theme(legend.position="none")
# activate (remove #) to save
imageFile <- paste(imageDirectory,"intageggplot.png",sep="/")
ggsave(file = imageFile)
# activate (remove #) to show
p0

###
p1 <- ggplot(data4, aes(age, intptw, colour = sex)) +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(fun.y = mean, geom = "point", aes(group = sex)) +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
  facet_wrap(~ sex, nrow = 1) +
  coord_cartesian(ylim = c(0, 7.5)) +
  theme_set(theme_bw(base_size = 20)) +
  labs(x = "", y = "Frequency (Mean: per 1,000 words)", colour = "sex") +
  theme(legend.position="none") +
  scale_color_manual(values = c("red", "lightblue"))
# activate (remove #) to save
imageFile <- paste(imageDirectory,"intAgeSexggplot.png",sep="/")
ggsave(file = imageFile)
# activate (remove #) to show
p1

###
p2 <- ggplot(data4, aes(age, intptw, colour = styp)) +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(fun.y = mean, geom = "point", aes(group = styp)) +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
  facet_wrap(~ styp, nrow = 1) +
  coord_cartesian(ylim = c(0, 7.5)) +
  theme_set(theme_bw(base_size = 20)) +
  labs(x = "", y = "Frequency (Mean: per 1,000 words)", colour = "styp") +
  theme(legend.position="none") +
  scale_color_manual(values = c("grey30", "grey30", "grey30", "grey30", "grey30", "grey30", "grey30"))
# activate (remove #) to save
imageFile <- paste(imageDirectory,"intAgeStypggplot.png",sep="/")
ggsave(file = imageFile)
# activate (remove #) to show
p2

###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intAgeGenderStyp.png") # save plot
par(mfrow=c(3,2))
brp3 <- data[data$styp == "br", ]
etp3 <- data[data$styp == "et", ]
lwp3 <- data[data$styp == "lw", ]
mtp3 <- data[data$styp == "mt", ]
tpp3 <- data[data$styp == "tp", ]
# brp3
absintsexagebr <- by(brp3$absint, list(brp3$agecat, brp3$gender), mean)
female <- round(as.vector(absintsexagebr)[1:8], 2)
male <- round(as.vector(absintsexagebr)[9:16], 2)
plot(female, type = "o", lwd = 2, lty = 1, pch = 19, ylim = c(0, .2), main = "book reading",
  ylab = "Relative Frequency", xlab = "Age", axes = F, cex = 1, col = "red")
lines(male, type = "o", lwd = 2,  lty = 2, pch = 0,  cex = 1, col = "blue")
# add x-axes with specified labels at specified intervals
axis(1, at = 1:8, lab = 3:10, cex = 2)
axis(2, at = seq(0, .2, .05), las = 1, lab = seq(0, .2, .05))
linetype <- c(1, 2)
plotchar <- c(19, 0)
legend("topleft", inset = .05, c("female", "male"),
  horiz = F,  pch = plotchar, lty = linetype, col = c("red", "blue"))
box()
grid()
# etp3
absintsexageet <- by(etp3$absint, list(etp3$agecat, etp3$gender), mean)
female <- round(as.vector(absintsexageet)[1:8], 2)
male <- round(as.vector(absintsexageet)[9:16], 2)
plot(female, type = "o", lwd = 2, lty = 1, pch = 19, ylim = c(0, .2), main = "experimental task",
  ylab = "", xlab = "Age", axes = F, cex = 1, col = "red")
lines(male, type = "o", lwd = 2,  lty = 2, pch = 0,  cex = 1, col = "blue")
# add x-axes with specified labels at specified intervals
axis(1, at = 1:8, lab = 3:10)
axis(2, at = seq(0, .2, .05), las = 1, lab = seq(0, .2, .05))
linetype <- c(1, 2)
plotchar <- c(19, 0)
legend("topleft", inset = .05, c("female", "male"),
  horiz = F,  pch = plotchar, lty = linetype, col = c("red", "blue"))
box()
grid()
# lwp3
absintagelw <- by(lwp3$absint, list(lwp3$agecat, lwp3$gender), mean)
female <- round(as.vector(absintagelw)[1:8], 2)
male <- round(as.vector(absintagelw)[9:16], 2)
plot(female, type = "o", lwd = 2, lty = 1, pch = 19, ylim = c(0, .2), main = "letter writing",
  ylab = "Relative Frequency", xlab = "Age", axes = F, cex = 1, col = "red")
lines(male, type = "o", lwd = 2,  lty = 2, pch = 0,  cex = 1, col = "blue")
# add x-axes with specified labels at specified intervals
axis(1, at = 1:8, lab = 3:10)
axis(2, at = seq(0, .2, .05), las = 1, lab = seq(0, .2, .05))
linetype <- c(1, 2)
plotchar <- c(19, 0)
legend("topleft", inset = .05, c("female", "male"),
  horiz = F,  pch = plotchar, lty = linetype, col = c("red", "blue"))
box()
grid()
# mtp3
absintsexragemt <- by(mtp3$absint, list(mtp3$agecat, mtp3$gender), mean)
female <- round(as.vector(absintsexragemt)[1:8], 2)
male <- round(as.vector(absintsexragemt)[9:16], 2)
plot(female, type = "o", lwd = 2, lty = 1, pch = 19, ylim = c(0, .2), main = "meal time",
  ylab = "Relative Frequency", xlab = "Age", axes = F, cex = 1, col = "red")
lines(male, type = "o", lwd = 2,  lty = 2, pch = 0,  cex = 1, col = "blue")
# add x-axes with specified labels at specified intervals
axis(1, at = 1:8, lab = 3:10)
axis(2, at = seq(0, .2, .05), las = 1, lab = seq(0, .2, .05))
linetype <- c(1, 2)
plotchar <- c(19, 0)
legend("topleft", inset = .05, c("female", "male"),
  horiz = F,  pch = plotchar, lty = linetype, col = c("red", "blue"))
box()
grid()
# tpp3
absintsexragetp <- by(tpp3$absint, list(tpp3$agecat, tpp3$gender), mean)
female <- round(as.vector(absintsexragetp)[1:8], 2)
male <- round(as.vector(absintsexragetp)[9:16], 2)
plot(female, type = "o", lwd = 2, lty = 1, pch = 19, ylim = c(0, .2), main = "toy play",
  ylab = "", xlab = "Age", axes = F, cex = 1, col = "red")
lines(male, type = "o", lwd = 2,  lty = 2, pch = 0,  cex = 1, col = "blue")
# add x-axes with specified labels at specified intervals
axis(1, at = 1:8, lab = 3:10)
axis(2, at = seq(0, .2, .05), las = 1, lab = seq(0, .2, .05))
linetype <- c(1, 2)
plotchar <- c(19, 0)
legend("topleft", inset = .05, c("female", "male"),
  horiz = F,  pch = plotchar, lty = linetype, col = c("red", "blue"))
box()
grid()
par(mfrow=c(1,1))
dev.off()
###############################################################
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intGenderStyp.png") # save plot
absintagestyp <- by(data$absint, list(data$styp, data$gender), mean)
st <- c("br", "et", "lw", "mt", "tp")
female <- absintagestyp[1:5]
male <- absintagestyp[6:10]
absintagestyptb <- rbind(female, male)
colnames(absintagestyptb) <- st
rownames(absintagestyptb) <- c("female", "male")
x <- barplot(absintagestyptb, main="", ylim = c(0, 0.2), ylab = "Probabilty of Intensification",
  xlab="Situation Type", col=c("lightgrey", "darkgrey"), beside=TRUE)
legend("topright", inset = .05, c("female", "male"), #title="Gender of child",
  horiz = F,  fill = c("lightgrey", "darkgrey"))
legend("topleft", inset=.05, title="Situation Types",
  c("br: book reading", "et: experimental task ", "lw: letter writing", "mt: meal time", "tp: toy play"),
  horiz=F)
text(c(1.5, 4.5, 7.5, 10.5, 13.5, 2.5, 5.5, 8.5, 11.5, 14.5), absintagestyp + .005, cex = 1, labels = round(absintagestyp, 2))
grid()
box()
dev.off()
###############################################################
p4 <- ggplot(data, aes(styp, absint, colour = gender)) +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(fun.y = mean, geom = "point", aes(group = gender)) +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
  coord_cartesian(ylim = c(0, .2)) +
  theme_set(theme_bw(base_size = 20)) +
  labs(x = "Age", y = "Frequency (Mean: Ratio of Intensified Slots)", colour = "styp") +
  theme(legend.position="none") +
  scale_color_manual(values = c("red", "blue"))
# activate (remove #) to save
imageFile <- paste(imageDirectory,"intGenderStyp.png",sep="/")
ggsave(file = imageFile)
# activate (remove #) to show
p4

###############################################################
wcstyp <- aggregate(absint ~ styp, data=data, FUN=mean)
wcstyp <- wcstyp[order(wcstyp$absint, decreasing = T),]
wcstyp[,1] <- c("br", "et", "lw", "mt", "tpbr")
pct <- round(wcstyp[, 2]/ sum(wcstyp[, 2])*100, 2)
png("C:\\03-MyProjects\\07IntensifierAcquisition\\Article\\images/intptwstyprecoded.png") # save plot
#dev.new(width=9, height=9)
barplot(wcstyp[, 2], ylim = c(0, .2), ylab = "Ratio of Intensified Slots", xlab = "Situation Type",
  col = c("grey50", "grey65", "grey80", "grey95"))
axis(1, at=seq(0.7, 5.5, 1.2), labels=wcstyp[, 1])
legend("topright", inset=.05, title="Situation Types",
   c("book reading (br)", "experimental task (et)", "letter writing (lw)", "meal time (mt)", "toy play (tp)"),
   fill=c("grey50", "grey65", "grey80", "grey95"), horiz=F)
box()
grid()
text(seq(0.7, 5.5, 1.2), wcstyp[, 2]+.005, round(wcstyp[, 2], 2))
dev.off()
###############################################################
###############################################################
###############################################################
### --- statz
###############################################################
###############################################################
###############################################################
# remove all incomplete cases from data set
data2 <- data2[complete.cases(data2),]
################################################################
### ---        Model Building - GLMEBR
################################################################
# set options
options(contrasts  =c("contr.treatment", "contr.poly"))
data.dist <- datadist(data2)
options(datadist = "data.dist")
data2$agecat <- as.numeric(data2$agecat)
data2$agecat <- as.vector(unlist(sapply(data2$agecat, function(x){
  x <- ifelse(x == 8, 10,
    ifelse(x == 7, 9,
    ifelse(x == 6, 8,
    ifelse(x == 5, 7,
    ifelse(x == 4, 6,
    ifelse(x == 3, 5,
    ifelse(x == 2, 4,
    ifelse(x == 1, 3, x)))))))) } )))
# a few words on glm vs lrm: Baayen (2008:196-197) states that lrm should be
# the function of choice in cases where each row contains
# exactly 1 success OR failure (1 or 0) while glm is preferrable if there are two
# columns holding the number of successes and the number of failures
# respectively. i have tried it both ways and both functions work fine if
# each row contains exactly 1 success OR failure but only glm can handle the
# latter case.
# generate initial saturated regression model including
# all variables and their interactions
m0.glm = glm(absint ~ 1, family = binomial, data = data2) # baseline model glm
m0.lrm = lrm(absint ~ 1, data = data2, x = T, y = T) # baseline model lrm
# inspect results
summary(m0.glm)

m0.lrm

###########################################################################
# create model with a random intercept for childid
m0.lmer <- lmer(absint ~ (1|chi), data = data2, family = binomial) # do not use: deprecated!
# Baayen (2008:278-284) uses the call above but the this call is now longer
# up-to-date because the "family" parameter is deprecated
# we switch to glmer (suggested by R) instead but we will also
# create a lmer object of the final minimal adequate model as some functions
# will not (yet) work on glmer
m0.glmer = glmer(absint ~ (1|chi), data = data2, family = binomial)

# results of the lmer object
print(m0.lmer, corr = F)

# check if including the random effect is permitted by comparing the aic from the glm to aic from the glmer model
aic.glmer <- AIC(logLik(m0.glmer))
aic.glm <- AIC(logLik(m0.glm))
aic.glmer; aic.glm

# the aic of the glm object is smaller which shows that including the random
# intercepts is NOT justified
###
# testing whether random intercepts are justified using a intlihood ratio test
lrt <- function (obj1, obj2) {
  L0 <- logLik(obj1)
  L1 <- logLik(obj2)
  L01 <- as.vector(- 2 * (L0 - L1))
  df <- attr(L1, "df") - attr(L0, "df")
  list(L01 = L01, df = df,
  "p-value" = pchisq(L01, df, lower.tail = FALSE))
  }
lrt(obj1=m0.glm, obj2=m0.glmer)

# the test confirms that including random intercepts is NOT justified
# inspect results
summary(m0.glm)

summary(m0.glmer)

###########################################################################
# model fitting
# fit the model to find the "best" model, i.e. the minimal adequate model
# we will use a step-wise step up, i.e. forward, procedure although with step-wise step-up
# the intlyhood of type II errors is much higher than with step-wise
# step down, i.e. backward elimination(cf. Field, Miles & Field 2012:265)
#	automated model fitting
m1.glm <- glm(absint ~ agecat, family = binomial, data = data2)
m2.glm <- glm(absint ~ agecat + gender, family = binomial, data = data2)
m3.glm <- glm(absint ~ agecat + gender + motrat, family = binomial, data = data2)
m4.glm <- glm(absint ~ agecat + gender + motrat + styp, family = binomial, data = data2)
m5.glm <- glm(absint ~ agecat + gender + motrat + styp + agecat : gender, family = binomial, data = data2)
m6.glm <- glm(absint ~ agecat + gender + motrat + styp + agecat : gender + agecat : motrat, family = binomial, data = data2)
m7.glm <- glm(absint ~ agecat + gender + motrat + styp + agecat : gender + agecat : motrat + agecat : styp, family = binomial, data = data2)
# we compare all models because this way, we get an overview of model paramerets
# and can check which model has the lowerst AIC, BIC, and the highest X^2 value
anova(m0.glm, m1.glm, m2.glm, m3.glm, m4.glm, m5.glm, m6.glm, m7.glm, test = "Chi")

# use customized model comparison function
# create comparisons
m1.m0 <- anova(m1.glm, m0.glm, test = "Chi")
m2.m1 <- anova(m2.glm, m1.glm, test = "Chi")
m3.m2 <- anova(m3.glm, m2.glm, test = "Chi")
m4.m3 <- anova(m4.glm, m3.glm, test = "Chi")
m5.m4 <- anova(m5.glm, m4.glm, test = "Chi")
m6.m5 <- anova(m6.glm, m5.glm, test = "Chi")
m7.m6 <- anova(m7.glm, m6.glm, test = "Chi")
# model 5: m4.glm is our final model
# create a list of the model comparisons
mdlcmp <- list(m1.m0, m2.m1, m3.m2, m4.m3, m5.m4, m6.m5, m7.m6)
# apply function
mdl.cmp.glm.swsu <- mdl.fttng.glm.swsd(mdlcmp)
# inspect output
mdl.cmp.glm.swsu

write.table(mdl.cmp.glm.swsu, "C:\\03-MyProjects\\07IntensifierAcquisition\\Article/mdl_cmp_glm_swsd.txt", sep="\t")
###########################################################################
mfin.glm <- glm(absint ~ agecat + motrat + styp, family = binomial, data = data2)
# rename final minimal adeqaute model
mlr.glm <- mfin.glm

# test if the final minimal adequate model performs better than the base-line model
anova(mlr.glm, m0.glm, test = "Chi")

# inspect results of the final minimal adequate model
print(mlr.glm, corr = F)

# alternative result display (anova)
anova(mlr.glm)

# alternative result display (anova)
anovasummary <- Anova(mlr.glm, type = "III", test = "Wald")

# extract the parameters of the fixed effects for the report
# to do that, we compare the model with only the random effect to a model with
# the random effect and the fixed effect for age
age.effect.glm <- anova(m0.glm, m1.glm, test = "Chi") # effect of age

# we test the effect of motrat, by adding stype and compare a
# model with motrat to a model without motrat
motrat.effect.glm <- anova(m2.glm, m3.glm, test = "Chi") # effect of motrat

# we test the effect of styp, by adding stype and compare a
# model with styp to a model without styp
styp.effect.glm <- anova(m4.glm, m3.glm, test = "Chi") # effect of styp

###########################################################################
### --- extracting and calculating model fit parameters
# we now create a lmr object equivalent to the final minimal adequate model
# but without the random effect
mlr.lrm <- lrm(absint ~ agecat + motrat + styp, data = data2, x = T, y = T)
# we now create a lmer object equivalent to the final minimal adequate model
mlr.lmer <- lmer(absint ~ agecat + motrat + styp + (1|chi), data = data2, family = binomial)

# now we check if the fixed effects of the lrm and the lmer model correlate (cf Baayen 2008:281)
cor.test(coef(mlr.lrm), fixef(mlr.lmer))

# the fixed effects correlate very strongly - this is good as it suggests that
# the coefficient estimates are very stable
# calculate prediction accuracy
pred <- as.vector(unlist(sapply(fitted(mlr.glm), function(x){
  x <- ifelse(x > .5, 1, 0) } )))
accuracy <- table(pred, data2$absint)
accuracy <- sum(diag(accuracy))/sum(accuracy)
accuracy <- round(accuracy*100, 2)
# extract summary of final model
blrm.summary <- blrm.summary(mlr.glm, mlr.lrm, accuracy)
blrm.summary

write.table(blrm.summary, "C:\\03-MyProjects\\07IntensifierAcquisition/blrmsummary.txt", sep="\t")
###########################################################################
# we activate the package Hmisc (if not already active)
library(Hmisc)
# we now extract model fit parameters (cf Baayen 2008:281)
probs = 1/(1+exp(-fitted(mlr.glm)))
probs = binomial()$linkinv(fitted(mlr.glm))
somers2(probs, as.numeric(data2$absint))

# the model fit values indicate a very good fit:
# C
# "The measure named C is an index of concordance between the predicted
# probability and the observed response. [...] When C takes the value 0.5, the
# predictions are random, when it is 1, prediction is perfect. A value above
# 0.8 indicates that the model may have some real predictive capacity."
# (Baayen 2008:204)
# "Somers’ Dxy,
# a rank correlation between predicted probabilities and observed responses, [...]
# ranges between 0 (randomness) and 1 (perfect prediction)." (Baayen 2008:204)

# extract Pseudo R^2 values for binomial logistic mixed effects model
my.lmer.nagelkerke(c("absint ~ agecat + motrat + styp", "(1 | childid)"), data)

###########################################################################
# model diagnostics: plot fitted against residuals
plot(mlr.glm)


###############################################################
#inspect the results meblrm.summary(glm0, glm1, glmer0, glmer1, dpvar)
meblrmsc <- meblrm.summary(m0.glm, m1.glm, m0.glmer, mlr.glmer, data$intcmsm) #
write.table(meblrmsc, "C:\\03-MyProjects\\07IntensifierAcquisition\\Article/meblrmsc.txt", sep="\t")

###########################################################################
###########################################################################
###########################################################################
### --- EXPLANATION of MODEL FIT PARAMETERS
###########################################################################
###########################################################################
###########################################################################
# R2 (Hosmer & Lemeshow)
# "Rt is the proportional reduction in the absolute value of the log-intlihood
# measure and as such it is a measure of how much the badness of fit improves
# as a result of the inclusionof the predictor variables. It can vary between 0
#(indicating that the predictors are useless at predicting the outcome variable)
# and 1 (indicating that the model predicts the outcome variable perfectly)"
# (Field, Miles & Field 2012:317).
###########################################################################
# R2 (Cox & Snell)
# "Cox and Snell's R~s (1989) is based on the deviance of the model (-2LL(new»)
# and the deviance of the baseline model (-2LL(baseline), and the sample size,
# n [...]. However, this statistic never reaches its theoretical maximum of 1.
###########################################################################
# R2 (Nagelkerke)
# Since R2 (Cox & Snell) never reaches its theoretical maximum of 1,
# Nagelkerke (1991) suggested Nagelkerke's R^2. (Field, Miles & Field 2012:317-318).
###########################################################################
# Somers’ Dxy
# Somers’ Dxy is a rank correlation between predicted probabilities and observed
# responses ranges between 0 (randomness) and 1 (perfect prediction). (Baayen 2008:204)
###########################################################################
# C
# C is an index of concordance between the predicted probability and the
# observed response. When C takes the value 0.5, the predictions are random,
# when it is 1, prediction is perfect. A value above 0.8 indicates that the
# model may have some real predictive capacity. (Baayen 2008:204)
###########################################################################
# AIC
# Akaike information criteria (AlC = -2LL + 2k): "contains more predictor variables.
# You can think of this as the price you pay for something: you get a better
# value of R2, but you pay a higher price, and was that higher price worth it?
# These information criteria help you to decide.model. The BIC is the same as
# the AIC but adjusts the penalty included in the AlC (i.e., 2k) by the number
# of cases: BlC = -2LL + 2k x log(n) in which n is the number of cases in the
# model." (Field, Miles & Field 2012:318).
################################################################
################################################################
################################################################
################################################################
### --- VERY IMPORTANT OBJECTS
################################################################
################################################################
################################################################
# inspect very important objects
head(data)

int.df1

# glmer
mdl.cmp.glmersc.swsu

mdl.cmp.glmersc.swsu.gries

age.effect.glmersc

styp.effect.glmersc

stypegender.effect.glmersc

meblrm.summary(m0.glm, m1.glm, m0.glmer, mlr.glmer, data$intcmsm)

anovasummary

###############################################################
###############################################################
###############################################################
### ---                   THE END
###############################################################
###############################################################
###############################################################