##################################################################
### --- R script "Combining biodata and word counts for speakers
### --- represented in the International Corpus of English (ICE) with R"
### --- Author: Martin Schweinberger (Dec 18th, 2013)
### --- R-Version: R version 3.0.1 (2013-05-16) -- "Good Sport"
### --- This R script retrieves the number of words for each
### --- speaker in the ICE corpus and
### --- merges the word counts with the biodata of the speakers
### --- provided by the compilers of the components of the ICE.
### --- NOTE
### --- Speakers who do not occur in the corpus but are included
### --- in the bioinfo provided by the ICE team are left out of the final
### --- bioinfo spreadsheet!!!
### --- Words uttered by extra-corpus speakers (annotation:
### --- start = ; end = ) are not considered in the word counts.
### --- Abbreviated forms ('ve, 're, 's etc.) are not considered full
### --- words but are regarded as part of the words to which they
### --- are attached, e.g. that's = 1 token/word; I've = 1 token/word.
### --- Each conversation is treated individually, i.e. for a file which
### --- contains several conversations among distinct or even
### --- the same speakers, the word counts for each conversation
### --- will be extracted separately from the other conversations
### --- in that file.
### --- CONTACT
### --- If you have questions or suggestions, if you found errors,
### --- or in case you would like to provide feedback,
### --- write an email to
### --- martin.schweinberger.hh@gmail.com
### --- CITATION
### --- If you use this script or results thereof, please cite it as:
### --- Schweinberger, Martin. 2013. "Combining biodata and word
### --- counts for speakers represented in the International Corpus of English
### --- (ICE) with R", unpublished R script, Hamburg University.
### --- ACKNOWLEDGEMENTS
### --- I want to thank the compilers of the ICE components for providing me
### --- with the raw biodata of the speakers represented by the
### --- ICE data - without their generosity, this script would not exist.
###############################################################
###############################################################
### START
###############################################################
# Remove all objects from the current workspace
# NOTE(review): rm(list = ls()) inside a script is generally discouraged;
# kept as-is to preserve the original behavior.
rm(list=ls(all=T))
# Install packages (run once, then keep commented out)
#install.packages("tm")
#install.packages("stringr")
#install.packages("gsubfn")
#install.packages("plyr")
#install.packages("reshape")
#install.packages("zoo")
# Load packages
library(tm)
library(stringr)
library(gsubfn)
library(plyr)
library(reshape)
library(zoo)
###############################################################
# Setting options: keep character vectors as characters, not factors
options(stringsAsFactors = F)
# Specify pathnames of the corpora (spoken component of ICE New Zealand)
corpus.nz <- "C:\\PhD\\skripts n data\\corpora\\ICE New Zealand\\Spoken"
# Define input pathname of raw biodata
bio.nz <- "C:\\PhD\\skripts n data\\corpora\\ICE New Zealand/NZGUIDE.txt"
# Define output path of final biodata
out.nz <- "C:\\MeineHomepage\\docs\\data/biodata ice new zealand.txt"
###############################################################
###############################################################
###############################################################
### ICE New Zealand
###############################################################
### START
###############################################################
# Prepare for loading corpus
# Choose the files you would like to use (all files below corpus.nz)
corpus.files = list.files(path = corpus.nz, pattern = NULL, all.files = T,
  full.names = T, recursive = T, ignore.case = T, include.dirs = T)
###############################################################
# Load and unlist corpus: one character vector per corpus file
corpus.tmp <- lapply(corpus.files, function(x) {
  scan(x, what = "char", sep = "\t", quiet = T) } )
corpus.tmp <- unlist(corpus.tmp)
# Paste all elements of the corpus together into one long string
corpus.tmp1 <- paste(corpus.tmp, collapse = " ")
# Clean corpus: normalize encoding and whitespace
corpus.tmp2 <- enc2utf8(corpus.tmp1)
corpus.tmp2 <- gsub(" {2,}", " ", corpus.tmp2)
corpus.tmp2 <- str_replace_all(corpus.tmp2, fixed("\n"), " ")
corpus.tmp2 <- str_trim(corpus.tmp2, side = "both")
###############################################################
# Specify search pattern
# NOTE(review): splitpattern1 is empty — the original pattern (most likely an
# ICE markup tag such as "<I>") appears to have been stripped when this
# script was converted to/from HTML; restore it from the original script
# before running, otherwise strsplit() splits into single characters.
splitpattern1 = ""
# Split corpus at splitpattern1 (one element per conversation/subtext)
corpus.tmp4 <- strsplit(as.character(corpus.tmp2), splitpattern1)
# Specify search pattern
# NOTE(review): splitpattern2 is a bare space — likewise probably a stripped
# markup tag in the original; verify.
splitpattern2 = " "
# Splits corpus into parts
corpus.tmp5 <- lapply(corpus.tmp4, function(x) {
  strsplit(as.character(x), splitpattern2) } )
# Extract file.ids (second element of each split chunk)
file.ids.tmp1 <- lapply(corpus.tmp5, function(x) {
  sapply(x, "[[", 2) } )
# Clean file ids: trim, drop everything after "#" and before "<"
file.ids.tmp2 <- lapply(file.ids.tmp1, function(x) {
  x <- str_trim(x)
  x <- gsub("#.*", "", x)
  x <- gsub(".*<", "", x) } )
file.ids <- as.vector(unlist(file.ids.tmp2))
# Extract subfile.ids: expand each file's subtext count (1-4) into a running
# subfile index, e.g. a count of 2 becomes "1 2", then flatten
subfile.ids.tmp1 <- unlist(file.ids.tmp2)
subfile.ids.tmp2 <- sapply(as.vector(table(subfile.ids.tmp1)), function(x) {
  x <- str_replace_all(x, "2","1 2")
  x <- str_replace_all(x, "3","1 2 3")
  x <- str_replace_all(x, "4","1 2 3 4")
  x <- paste(x, collapse = " ")
  x <- strsplit(as.character(x), " ")
  x <- unlist(x)
  x <- sapply(x, "[", 1) } )
subfile.ids <- as.vector(unlist(subfile.ids.tmp2))
# Transform corpus.tmp6 into a data frame
# NOTE(review): there is no corpus.tmp6 — the object converted here is
# corpus.tmp5.
corpus.tmp7 <- as.data.frame(corpus.tmp5)
# Store results in vector (second row = the text of each conversation)
corpus.tmp8 <- corpus.tmp7[2, c(1:length(corpus.tmp7))]
# Convert into character strings
corpus.tmp9 <- as.character(corpus.tmp8)
# Add names to corpus.tmp9
names(corpus.tmp9) <- file.ids
###############################################################
# create a table out of the results
corpus.tmp10 <- as.data.frame(corpus.tmp9)
corpus.tmp11 <- t(corpus.tmp10)
corpus.tmp12 <- as.table(corpus.tmp11)
corpus.table1 <- cbind(file.ids[1:length(file.ids)],
  corpus.tmp12[,1:length(file.ids)])
# Add id as a column
id <- 1:length(corpus.table1[, 1])
corpus.table2 <- cbind(id, corpus.table1)
corpus.table2 <- cbind(corpus.table2[, 1], corpus.table2[, 2],
  subfile.ids, corpus.table2[, 3])
# Add column labels
colnames(corpus.table2) <- c("id", "file", "subfile", "corpusfile")
# Add row labels
rownames(corpus.table2) <- c(1:length(corpus.table2[,1]))
corpus.table2 <- as.table(corpus.table2)
###############################################################
### --- STEP
###############################################################
# Extract the corpus file (fourth column = raw conversation text)
all.files <- corpus.table2[1:nrow(corpus.table2), 4]
# Clean corpus file
# NOTE(review): the pattern ".*?" looks truncated (probably something like
# "<.*?>" in the original); as written this sub() removes an empty match,
# i.e. nothing.
all.files <- lapply(all.files, function(x) {
  x <- sub(".*?", "", x) } )
# Extract the speech unit count for each speaker
# NOTE(review): `speakers` and `speakers.and.turns` are used here but never
# defined above — the code that split all.files into speaker turns appears
# to have been lost in conversion; restore it from the original script.
turn.count <- lapply(speakers, function(x) {
  x <- gsub("<.*>", "1", x) } )
# Store turns in extra vector (second element of each speaker/turn pair)
turns <- lapply(speakers.and.turns, function(x) {
  sapply(x, function(x) x[2]) } )
###############################################################
# Create a list with all turns but cleaned, i.e.
# ... without metas (markup removed from each turn)
turns.clean <- lapply(turns, function(x) {
  # NOTE(review): several patterns below are empty "()" or otherwise
  # truncated — the original markup patterns (e.g. "<O>...</O>",
  # "<unclear>...</unclear>") appear to have been stripped during HTML
  # conversion. In particular "(.*)" as written would blank the whole
  # string. Restore the patterns from the original script before running.
  x <- str_replace_all(x, "()","")
  # x <- gsub("<\\?>.*?", " ", x)
  x <- str_replace_all(x, "(<&.*/&.*>)","")
  x <- str_replace_all(x, "()","")
  x <- str_replace_all(x, "(.*?)","")
  x <- str_replace_all(x, "(<[a-z]{4,}.*)","")
  # x <- str_replace_all(x, "(<\\..*?/.>)","")
  # WARNING: THEORETICAL ISSUE
  x <- str_replace_all(x, "(.*)","")
  x <- str_replace_all(x, "()","")
  x <- str_replace_all(x, "()","")
  # WARNING: THEORETICAL ISSUE
  # Collapse multiple spaces left behind by the deletions above
  x <- gsub(" {2,}", " ", x)
  # Re-attach clitics so contracted forms count as a single word
  x <- gsub(" re |'re ", "'re ", x)
  x <- gsub(" ll |'ll ", "'ll ", x)
  x <- gsub(" ve |'ve ", "'ve ", x)
  x <- gsub(" s ", "'s ", x)
  x <- gsub(" d ", "'d ", x)
  x <- gsub(" {0,1}I m ", " I'm ", x)
  # Restore apostrophes in negated auxiliaries
  x <- gsub("ouldnt ", "ouldn't ", x)
  x <- gsub(" cant ", " can't ", x)
  x <- gsub("Cant ", "Can't ", x)
  x <- gsub("Dont ", "Don't ", x)
  x <- gsub(" dont ", " don't ", x)
  x <- gsub("Didnt ", "Didn't ", x)
  x <- gsub(" didnt ", " didn't ", x)
  x <- gsub("Isnt ", "Isn't ", x)
  x <- gsub(" isnt ", " isn't ", x)
  x <- gsub("Arent ", "Aren't ", x)
  x <- gsub(" arent ", " aren't ", x)
  x <- gsub("Wasnt ", "Wasn't ", x)
  x <- gsub(" wasnt ", " wasn't ", x)
  # Remove any remaining tags and punctuation, then normalize spacing
  x <- gsub("(<.*?>)", " ", x)
  x <- gsub("(\\?|\\(|\\)|\\{|\\}|\\[|\\]|\\$|\\&|\\.|-|>|<|\\?|/|=|,)", " ", x)
  x <- gsub(" {2,}", " ", x)
  x <- str_trim(x) } )
###############################################################
### --- Create a list which holds the number of words per speech unit
###############################################################
# First, we tokenize the list elements (split each cleaned turn at spaces)
tokenized <- lapply(turns.clean, function(x){
  tokenized <- strsplit(x, " ") } )
# Now, we count the word elements (words) in each turn (list element)
word.count <- lapply(tokenized, function(x) {
  sapply(x, function(y) length(y)) } )
###############################################################
# Create a list for all files in the corpus which holds the
# entire speaker information (speaker, turn, turn.clean,
# turn.count, word.count)
###############################################################
# Combine speakers, raw turns, and cleaned turns column-wise, per file
# NOTE(review): `speakers` and `file.subfile.ids` are not defined anywhere in
# this (apparently truncated) script — confirm against the original source.
speaker.and.unclean.turns <- mapply(cbind, speakers[], turns[], SIMPLIFY = F)
speaker.both.turns <- mapply(cbind, speaker.and.unclean.turns[],
  turns.clean[], SIMPLIFY = F)
names(speaker.both.turns) <- file.subfile.ids
# Add file.subfile.ids to speaker.both.turns
speaker.both.turns.subfile <- mapply(cbind, speaker.both.turns[],
  names(speaker.both.turns[]), SIMPLIFY = F)
# Add turn.counts
speaker.both.turns.subfile.and.turn.count <- mapply(cbind,
  speaker.both.turns.subfile [], turn.count[], SIMPLIFY = F)
# Add word.counts
speakerinfo1 <- mapply(cbind, speaker.both.turns.subfile.and.turn.count[],
  word.count[], SIMPLIFY = F)
# Add names
names(speakerinfo1) <- file.subfile.ids
# We now need to convert the elements of the fourth and fifth column into
# numeric elements
# NOTE(review): columns 5 and 6 (turn count and word count) are the ones
# converted below, not 4 and 5 as the original comment said.
speakerinfo2 <-lapply(speakerinfo1, function(x) {
  X <- as.data.frame(x[])
  X[, 5] <- as.numeric(X[, 5])
  X[, 6] <- as.numeric(X[, 6])
  x <- X } )
###############################################################
# Rename data for later kwic searches
kwic.tb.ice.nz <- speakerinfo2
###############################################################
# Extract the word counts for speakers in one file:
# sum word.count (column 6) per speaker label (column 1)
word.count.result <- lapply(X = speakerinfo2, function(x) {
  sapply(x, function(y) as.data.frame(tapply(x[[6]], x[[1]], sum))) } )
# Simplify the results
overview.word.count.results <- sapply(word.count.result, "[[", 1)
# Extract the turn counts for speakers in one file:
# sum turn.count (column 5) per speaker label (column 1)
turn.count.result <- lapply(X = speakerinfo2, function(x) {
  sapply(x, function(y) as.data.frame(tapply(x[[5]], x[[1]], sum))) } )
# Simplify the results
overview.turn.count.results <- sapply(turn.count.result, "[[", 1)
###############################################################
# We now want to extract the full speaker ids in vector format
speaker.id.list <- lapply(speakerinfo2, function(x) {
  x <- x[, 1]
  x <- names(table(x)) } )
speakers.full.ids <- as.vector(unlist(speaker.id.list))
###############################################################
# Reduce the full speaker labels (column 1 of each per-file table) to their
# components (speaker reference, subfile, file) so they can be combined
# into a single results table below.
# Helper: unique speaker labels of one file, run through two deletions.
strip.labels <- function(dat, first.pat, second.pat) {
  labels <- names(table(dat[, 1]))
  labels <- gsub(first.pat, "", labels)
  gsub(second.pat, "", labels)
}
# Speaker ids: drop everything up to the last ":" and the closing ">"
speaker.id.list <- lapply(speakerinfo2, strip.labels,
  first.pat = ".*:", second.pat = ">")
speaker.ids <- as.vector(unlist(speaker.id.list))
###############################################################
# Subfile ids: keep the part between "#" and ":"
subfile.list <- lapply(speakerinfo2, strip.labels,
  first.pat = ".*#", second.pat = ":.*")
subfiles <- as.vector(unlist(subfile.list))
##############################################################
# File names: keep the part before "#", dropping the opening "<"
file.list <- lapply(speakerinfo2, strip.labels,
  first.pat = "#.*", second.pat = "<")
files <- as.vector(unlist(file.list))
# The speaker references are simply the cleaned speaker ids
speakers <- speaker.ids
###############################################################
# Flatten the per-file turn counts into one vector, one value per speaker
turn.count.list <- lapply(overview.turn.count.results, function(counts) {
  sapply(counts, function(entry) sapply(entry, "[[", 1))
})
turn.counts <- as.vector(unlist(turn.count.list))
###############################################################
# Flatten the per-file word counts into one vector, one value per speaker
word.count.list <- lapply(overview.word.count.results, function(counts) {
  sapply(counts, function(entry) sapply(entry, "[[", 1))
})
word.counts <- as.vector(unlist(word.count.list))
###############################################################
# We now want to create a table with speaker id, turn count and word count
# First, we create an index
id <- c(1:length(speaker.ids))
# Now, we set up the data frame
speakerinfo.ice <- cbind(id, speakers.full.ids, files, subfiles, speakers,
  turn.counts, word.counts)
speakerinfo.ice <- as.data.frame(speakerinfo.ice)
colnames(speakerinfo.ice ) <- c("id", "file.speaker.id", "text.id",
  "subfile.id", "spk.ref", "speech.unit.count", "word.count")
# View results (without empty rows)
speakerinfo.ice.1 <- speakerinfo.ice[!speakerinfo.ice[, 2] == "", ]
# Re-type the count columns and renumber the ids after dropping rows
speakerinfo.ice.1[, 6] <- as.numeric(speakerinfo.ice.1[, 6])
speakerinfo.ice.1[, 7] <- as.numeric(speakerinfo.ice.1[, 7])
speakerinfo.ice.1[, 1] <- 1:length(speakerinfo.ice.1[, 1])
rownames(speakerinfo.ice.1) <- speakerinfo.ice.1[, 1]
# Modify text.ids to match the text.ids from the biodata (drop hyphens)
speakerinfo.ice.1[, 3] <- gsub("(-)", "", speakerinfo.ice.1[, 3])
############################################
### --- STEP
################################################################
# Load biodata (the NZGUIDE.txt guide distributed with ICE New Zealand)
meta.tmp1 <- scan(file = bio.nz, what = "char", sep = " ", quiet = T)
meta.tmp2 <- unlist(meta.tmp1)
##################################################################
# Paste all elements of the meta data together
meta.tmp3 <- paste(meta.tmp2, collapse = " ")
# Clean corpus: normalize encoding and whitespace
meta.tmp3 <- enc2utf8(meta.tmp3)
meta.tmp3 <- gsub(" {2,}", " ", meta.tmp3)
meta.tmp3 <- str_replace_all(meta.tmp3, fixed("\n"), " ")
meta.tmp3 <- str_replace_all(meta.tmp3, fixed("\t"), " ")
meta.tmp3 <- str_trim(meta.tmp3, side = "both")
# Cut off everything before the guide section "8.1 Direct Conversations"
meta.tmp3 <- str_replace_all(meta.tmp3, ".*Direct Conversations",
  "8.1 Direct Conversations ")
###############################################################
# Split corpus at file-id prefixes of the form letter-digit-letter
# (e.g. "S1A"), using "~" as a temporary separator
meta.tmp4 <- strsplit(
  gsub("([A-Z][0-9][A-Z])", "~\\1", meta.tmp3),
  "~"
)
# Remove all superfluous meta
# NOTE(review): the indices 2:379 are hard-coded for this exact NZGUIDE.txt;
# they will silently break if the guide file changes.
meta.tmp5 <- meta.tmp4[[1]][2:379]
# Split file meta from speaker meta (after the "... mins)" duration;
# call continues on the next lines)
meta.tmp6 <- strsplit(
  # NOTE(review): the pattern "(min|mins))" contains an unbalanced ")" inside
  # the regex — presumably intended to match durations like "18 mins)";
  # verify against the original script.
  gsub("(min|mins))", "\\1)~", meta.tmp5),
  "~"
)
# Delete superfluous spaces
meta.tmp7 <- lapply(meta.tmp6, function(x) {
  x <- gsub(" {2,}", " ", x)
  x <- str_trim(x, side = "both") } )
###############################################################
# REPAIR BROKEN ELEMENTS (guide entries the generic split above mangles):
# meta.tmp7[[159]], meta.tmp7[[161]], meta.tmp7[[202]]
# meta.tmp7[[159]]
meta.tmp7[[159]][2] <- meta.tmp7[[159]][1]
meta.tmp7[[159]][1] <- gsub("S is a Pakeha male aged.*", "",
  meta.tmp7[[159]][1])
meta.tmp7[[159]][2] <- gsub(".*Prisons, 18 mins DGU011 ", "",
  meta.tmp7[[159]][2])
# meta.tmp7[[161]]
meta.tmp7[[161]][2] <- meta.tmp7[[161]][1]
meta.tmp7[[161]][1] <- gsub("S is a Pakeha male.*", "",
  meta.tmp7[[161]][1])
meta.tmp7[[161]][2] <- gsub(".*Pollution Bill,5 mins DGU013 ", "",
  meta.tmp7[[161]][2])
# meta.tmp7[[202]]
meta.tmp7[[202]][2] <- meta.tmp7[[202]][1]
meta.tmp7[[202]][1] <- gsub("A is a Pakeha male 55-59.*", "",
  meta.tmp7[[202]][1])
meta.tmp7[[202]][2] <- gsub(".*Cup, 5 mins 36 mins ", "",
  meta.tmp7[[202]][2])
# Drop any section headings (e.g. "8.2 Some Heading ...") that slipped in
meta.tmp7 <- lapply(meta.tmp7, function(x) {
  x <- gsub("([0-9]\\.[0-9]{1,2} [A-Z][a-z]{1,} .*)", "", x)
  unlist(x) } )
# meta.tmp7[[378]][2]: cut off the start of the following guide section
meta.tmp7[[378]][2] <- gsub("9 Spoken Participants In this section.*", "",
  meta.tmp7[[378]][2])
###############################################################
# Extract the file meta (first element of each pair)
file.meta.tmp1 <- lapply(meta.tmp7, function(x) {
  x <- x[1]
  unlist(x) } )
file.meta.tmp2 <- unlist(file.meta.tmp1)
# Extract the speaker meta (second element of each pair)
meta.tmp8 <- lapply(meta.tmp7, function(x) {
  x <- x[2] } )
# Add names to meta data
names(meta.tmp8) <- file.meta.tmp2
# Extract speakers: split before phrases like "A is a ..." or
# "A, B, C and D are ..."
spk.tmp1 <- sapply(meta.tmp8, function(x) {
  x <- strsplit(
    gsub("([A-Z]{1,2} is [a-z]{0,2}|[A-Z],{0,1} [A-Z]{0,1},{0,1} {0,1}[A-Z]{0,1},{0,1} {0,1}[A-Z]{0,1} {0,1}[a-z]{0,3} [A-Z] are [a-z]{0,2})", "~\\1", x),
    "~"
  ) } )
# Extract the speaker meta (drop the leading file description)
spk.tmp2 <- lapply(spk.tmp1, function(x) {
  x <- x[2:length(x)] } )
# REPAIR BROKEN ELEMENTS (speakers missing from or garbled in the guide)
spk.tmp2[[103]][7] <- "X is a female"
# Manual repairs continued: fill in speakers the regex split missed.
# Each assignment is specific to this exact NZGUIDE.txt.
spk.tmp2[[103]][8] <- "Y is a female"
spk.tmp2[[105]][9] <- "X is a female"
spk.tmp2[[105]][10] <- "Y is a female"
spk.tmp2[[106]][9] <- "X is a female"
spk.tmp2[[106]][10] <- "Y is a female"
spk.tmp2[[107]][8] <- "W is a female"
spk.tmp2[[107]][9] <- "X is a female"
spk.tmp2[[107]][10] <- "Y is a female"
spk.tmp2[[109]][7] <- "X is a female"
spk.tmp2[[109]][8] <- "Y is a female"
spk.tmp2[[111]][7] <- "X is a male"
spk.tmp2[[111]][8] <- "Y is a male"
spk.tmp2[[112]][11] <- "X is a male"
spk.tmp2[[112]][12] <- "Y is a male"
spk.tmp2[[117]][12] <- "X is a male"
spk.tmp2[[117]][13] <- "Y is a male"
spk.tmp2[[119]][7] <- "X is a female"
spk.tmp2[[119]][8] <- "Y is a female"
# Entry 180: strip the explanatory note, then add the unknown speaker
spk.tmp2[[180]][2] <- gsub("X refers to an unknown.*", "",
  spk.tmp2[[180]][2])
spk.tmp2[[180]][3] <- "X is a NA"
spk.tmp2[[184]][6] <- "X is a female"
spk.tmp2[[184]][7] <- "Y is a female"
spk.tmp2[[186]][5] <- "W is a female"
spk.tmp2[[186]][6] <- "V is a female"
spk.tmp2[[186]][7] <- "X is a female"
spk.tmp2[[186]][8] <- "Y is a female"
spk.tmp2[[191]][5] <- "W is a female"
spk.tmp2[[191]][6] <- "X is a male"
spk.tmp2[[191]][7] <- "Y is a male"
spk.tmp2[[203]][2] <- "V is a female"
spk.tmp2[[203]][3] <- "W is a female"
spk.tmp2[[203]][4] <- "X is a female"
spk.tmp2[[203]][5] <- "Y is a female"
spk.tmp2[[213]][2] <- "U is a female"
spk.tmp2[[213]][3] <- "V is a female"
spk.tmp2[[213]][4] <- "W is a female"
spk.tmp2[[213]][5] <- "X is a female"
spk.tmp2[[213]][6] <- "Y is a female"
spk.tmp2[[240]][2] <- "T is a male"
spk.tmp2[[240]][3] <- "U is a male"
spk.tmp2[[240]][4] <- "V is a female"
spk.tmp2[[240]][5] <- "W is a male"
spk.tmp2[[240]][6] <- "X is a female"
spk.tmp2[[240]][7] <- "Y is a female"
spk.tmp2[[265]][2] <- "X is a male"
spk.tmp2[[265]][3] <- "Y is a male"
spk.tmp2[[363]][3] <- "S is a male"
spk.tmp2[[363]][4] <- "U is a male"
spk.tmp2[[363]][5] <- "V is a male"
spk.tmp2[[363]][6] <- "W is a male"
##################################################################
# Setting up table
# Extracting speakers: one entry per speaker description
speakers <- sapply(spk.tmp2, function(x) {
  x <- x[] } )
speakers <- as.vector(unlist(speakers))
# Counting number of speakers per file
nspeakers <- sapply(spk.tmp2, function(x) {
  length(x) } )
nspeakers <- as.vector(unlist(nspeakers))
# Extracting file ids
file.ids <- names(spk.tmp2)
# Setting up table: repeat each file id once per speaker
biodata.tb.tmp1 <- cbind(rep(file.ids, nspeakers), speakers)
# Add spk.ref (the single-letter speaker reference, e.g. "A")
spk.ref <- gsub(" .*", "", biodata.tb.tmp1[, 2])
# Setting up table
biodata.tb.tmp2 <- cbind(biodata.tb.tmp1, spk.ref)
# Add age: mark age ranges like "30-34", isolate them, then set the
# non-matching (sorted-to-the-back) entries to "NA".
# NOTE(review): the cut-off 841 is hard-coded for this data set.
age.tmp1 <- gsub("([0-9][0-9]-[0-9][0-9])", "#\\1~", biodata.tb.tmp1[, 2])
age.tmp2 <- gsub("~.*", "", age.tmp1)
age.tmp3 <- gsub(".*#", "", age.tmp2)
age.index1 <- order(age.tmp3)
age.tmp4 <- age.tmp3[order(age.tmp3)]
age.tmp4[841:length(age.tmp4)] <- "NA"
age <- age.tmp4[order(age.index1)]
# Setting up table
biodata.tb.tmp3 <- cbind(biodata.tb.tmp2, age)
# Add sex ("female" is marked before " male" so the two do not overlap);
# same sort / cut-off / unsort trick as for age
sex.tmp1 <- gsub("(female)", "#\\1~", biodata.tb.tmp1[, 2])
sex.tmp1 <- gsub("( male)", "#\\1~", sex.tmp1)
sex.tmp2 <- gsub("~.*", "", sex.tmp1)
sex.tmp3 <- gsub(".*#", "", sex.tmp2)
sex.index1 <- order(sex.tmp3)
sex.tmp4 <- sex.tmp3[order(sex.tmp3)]
sex.tmp4[841:length(sex.tmp4)] <- "NA"
sex <- sex.tmp4[order(sex.index1)]
sex <- str_trim(sex, side = "both")
# Setting up table
biodata.tb.tmp4 <- cbind(biodata.tb.tmp3, sex)
# Add date (recording dates like "5/3/90"); cut-off 911 is data-specific
date.tmp1 <- gsub("([0-9][0-9]{0,1}/[0-9][0-9]{0,1}/[0-9][0-9])", "#\\1~",
  biodata.tb.tmp1[, 1])
date.tmp2 <- gsub("~.*", "", date.tmp1)
date.tmp3 <- gsub(".*#", "", date.tmp2)
date.index1 <- order(date.tmp3)
date.tmp4 <- date.tmp3[order(date.tmp3)]
date.tmp4[911:length(date.tmp4)] <- "NA"
date <- date.tmp4[order(date.index1)]
# Setting up table
biodata.tb.tmp5 <- cbind(biodata.tb.tmp4, date)
# Add file (ids like "S1A001")
file.tmp1 <- gsub("([A-Z][0-9][A-Z][0-9][0-9][0-9])", "#\\1~",
  biodata.tb.tmp1[, 1])
file.tmp2 <- gsub("~.*", "", file.tmp1)
file <- gsub(".*#", "", file.tmp2)
# Setting up table
biodata.tb.tmp6 <- cbind(biodata.tb.tmp5, file)
# Add subfile: a trailing letter a/b/c maps to subfile number 1/2/3
subfile.tmp1 <- gsub("([A-Z][0-9][A-Z][0-9][0-9][0-9][a-z]{0,1})", "#\\1~",
  biodata.tb.tmp1[, 1])
subfile.tmp2 <- gsub("~.*", "", subfile.tmp1)
subfile.tmp3 <- gsub(".*#", "", subfile.tmp2)
subfile.index1 <- order(subfile.tmp3)
subfile.tmp4 <- subfile.tmp3[order(subfile.tmp3)]
subfile.tmp5 <- gsub("([A-Z][0-9][A-Z][0-9][0-9][0-9])", "", subfile.tmp4)
# NOTE(review): unlike age/sex/date, subfile is never restored to the
# original row order via order(subfile.index1) — this looks like a bug
# (subfile values may be misaligned with their rows); verify.
subfile <- as.vector(unlist(sapply(subfile.tmp5, function(x) {
  ifelse(x == "a", 1,
  ifelse(x == "b", 2,
  ifelse(x == "c", 3, 1))) } )))
# Setting up table
biodata.tb.tmp7 <- cbind(biodata.tb.tmp6, subfile)
# Add ethnicity: whatever follows "... is a " up to "male"/"female";
# hard-coded positions below hold entries without a usable ethnicity
ethnicity.tmp1 <- gsub("(.* is [a-z]{0,2} )", "\\1~", biodata.tb.tmp1[, 2])
ethnicity.tmp2 <- gsub(".*~", "", ethnicity.tmp1)
ethnicity.tmp3 <- gsub("female.*", "", ethnicity.tmp2)
ethnicity.tmp3 <- gsub("male.*", "", ethnicity.tmp3)
ethnicity.index1 <- order(ethnicity.tmp3)
ethnicity.tmp4 <- ethnicity.tmp3[order(ethnicity.tmp3)]
ethnicity.tmp4[1:123] <- NA
ethnicity.tmp4[238] <- NA
ethnicity.tmp4[958] <- NA
ethnicity.tmp4[976:1002] <- NA
ethnicity.tmp5 <- ethnicity.tmp4[order(ethnicity.index1)]
ethnicity.tmp5 <- str_trim(ethnicity.tmp5, side = "both")
# Rename vector
ethnicity <- ethnicity.tmp5
# Setting up table
biodata.tb.tmp8 <- cbind(biodata.tb.tmp7, ethnicity)
# Adding an id to the table
biodata.tb.tmp9 <- cbind(1:length(biodata.tb.tmp8[, 1]), biodata.tb.tmp8)
# Extract occupation of speaker: strip ages, parenthesised notes,
# commas, and any sex phrases from the speaker description
occ.tmp1 <- biodata.tb.tmp9[, 3]
occ.tmp2 <- gsub("(.*[0-9][0-9]\\-[0-9][0-9])", "", occ.tmp1)
occ.tmp3 <- gsub("(\\(#.*)", "", occ.tmp2)
occ.tmp4 <- gsub("(, )", "", occ.tmp3)
occ.tmp5 <- gsub("(.* male.*)", "", occ.tmp4)
occ.tmp6 <- gsub("(.* female.*)", "", occ.tmp5)
occ.tmp7 <- gsub("(.* is the .*)", "", occ.tmp6)
occ.tmp8 <- str_trim(occ.tmp7, side = "both")
occupation <- occ.tmp8
# Adding occupation to the table
biodata.tb.tmp10 <- cbind(biodata.tb.tmp9, occupation)
# Finalize biodata table: reorder columns to
# id, file, subfile, spk.ref, sex, age, ethnicity, occupation, date
biodata.tb.tmp11 <- cbind(biodata.tb.tmp10[, c(1, 8, 9, 4, 6, 5, 10, 11, 7)])
# Add column names (assignment continues on the next line)
colnames(biodata.tb.tmp11) <-
  c("id", "text.id", "subfile.id", "spk.ref", "sex", "age",
    "ethnicity", "occupation", "date")
# Final clean up: turn literal "NA" strings (and empty occupations)
# into real NA values
# NOTE(review): gsub("NA", NA, x) also hits "NA" occurring inside longer
# values; confirm no legitimate value contains the substring "NA".
biodata.tb.tmp11[, 5] <- as.vector(unlist(sapply(biodata.tb.tmp11[, 5],
  function(x) { x <- gsub("NA", NA, x) } )))
biodata.tb.tmp11[, 6] <- as.vector(unlist(sapply(biodata.tb.tmp11[, 6],
  function(x) { x <- gsub("NA", NA, x) } )))
biodata.tb.tmp11[, 7] <- as.vector(unlist(sapply(biodata.tb.tmp11[, 7],
  function(x) { x <- gsub("NA", NA, x) } )))
biodata.tb.tmp11[, 8] <- as.vector(unlist(sapply(biodata.tb.tmp11[, 8],
  function(x) { ifelse(x == "", NA, x)} )))
biodata.tb.tmp11[, 8] <- as.vector(unlist(sapply(biodata.tb.tmp11[, 8],
  function(x) { x <- gsub("NA", NA, x) } )))
biodata.tb.tmp11[, 9] <- as.vector(unlist(sapply(biodata.tb.tmp11[, 9],
  function(x) { x <- gsub("NA", NA, x) } )))
# Rename data
biodata.tmp1 <- biodata.tb.tmp11
biodata.ice.nz <- biodata.tb.tmp11
################################################################
### --- STEP
################################################################
# We will now join the two data sets
# Transform data sets into data frames
speakerinfo.ice.1 <- as.data.frame(speakerinfo.ice.1)
biodata.tmp1 <- as.data.frame(biodata.tmp1)
# Join data sets (without speakers that do not occur in the corpus but do
# occur in the biodata spreadsheet provided by the corpus compilers)
# RECOMMENDED: the left join keeps every corpus speaker exactly once
biodata.ice.nz.tmp1 <- join(speakerinfo.ice.1, biodata.tmp1,
  by = c("text.id", "subfile.id", "spk.ref"), type = "left")
# Reorganize data set: new running id first, then corpus columns,
# then biodata columns, then the count columns
biodata.ice.nz.tmp2 <- cbind(1:length(biodata.ice.nz.tmp1[, 1]),
  biodata.ice.nz.tmp1[, c(1:5, 9:12, 6:7)])
# Reorganize data set: label the new id and the original id
colnames(biodata.ice.nz.tmp2) <- c("id", "id.orig",
  colnames(biodata.ice.nz.tmp2)[3:12])
# Rename data
biodata.ice.nz <- biodata.ice.nz.tmp2
# Inspect data
#head(biodata.ice.nz)
###############################################################
###############################################################
###############################################################
### ---
# Important objects (ICE New Zealand):
#   kwic.tb.ice.nz  - per-file speaker/turn tables for later KWIC searches
#   biodata.ice.nz  - final biodata table with word and speech-unit counts
#head(kwic.tb.ice.nz)
#head(biodata.ice.nz)
###
# Save results in a txt file
# FIX: use FALSE instead of the reassignable shorthand F, and drop the
# unused `output.file` variable; write.table() creates the file itself,
# so file.create() is kept only to surface permission problems early.
file.create(out.nz, showWarnings = FALSE)
# Store the final biodata table as tab-separated text without row names
write.table(biodata.ice.nz, out.nz, sep = "\t", row.names = FALSE)
###############################################################
###############################################################
###############################################################
### --- THE END
# Remove all objects from the current workspace (run manually if desired)
#rm(list=ls(all=T))
###############################################################
###############################################################
###############################################################