##################################################################
### --- R-Skript "Combining biodata and word counts for speakers
### --- represented in the International Corpus of English (ICE) with R"
### --- Author: Martin Schweinberger (Dec 18th, 2013)
### --- R-Version: R version 3.0.1 (2013-05-16) -- "Good Sport"
### --- This R script retrieves the number of words for each
### --- speaker in the ICE corpus and
### --- merges the word counts with the biodata of the speakers
### --- provided by the compilers of the components of the ICE.
### --- NOTE
### --- Speakers who do not occur in the corpus but are included
### --- in bioinfo provided by the ICE team are left out of the final
### --- bioinfo spreadsheet!!!
### --- Words uttered by extra corpus speakers (annotation:
### --- start = ; end = ) are not considered in the word counts.
### --- Abbreviated forms ('ve, 're, 's etc.) are not considered full
### --- words but are regarded as part of the words to which they
### --- are attached, e.g. that's = 1 token/word; I've = 1 token/word)
### --- Each conversation is treated individually, i.e. for a file which
### --- contains several conversations among distinct or even
### --- the same speakers, the word counts for each conversation
### --- will be extracted separately from the other conversations
### --- in that file.
### --- CONTACT
### --- If you have questions or suggestions, if you found errors,
### --- or in case you would like to provide feedback,
### --- write an email to
### --- martin.schweinberger.hh@gmail.com
### --- CITATION
### --- If you use this script or results thereof, please cite it as:
### --- Schweinberger, Martin. 2013. " Combining biodata and word
### --- counts for speakers represented in the International Corpus of English
### --- (ICE) with R ", unpublished R-skript, Hamburg University.
### --- ACKNOWLEDGEMENTS
### --- I want to thank the compilers of the ICE components for providing me
### --- with the raw biodata of the speakers represented by the
### --- ICE data - without their generosity, this script would not exist.
###############################################################
###############################################################
### START
###############################################################
# Clear the current workspace so the script starts from a clean state.
# NOTE(review): rm(list = ls()) in a distributed script is generally
# discouraged (it silently wipes the user's session); kept because the
# original script relies on starting from an empty workspace.
rm(list = ls(all = TRUE))
# Install packages (run once, then keep commented out)
#install.packages("tm")
#install.packages("stringr")
#install.packages("gsubfn")
#install.packages("plyr")
#install.packages("reshape")
#install.packages("zoo")
# Load packages
library(tm)
library(stringr)
library(gsubfn)
library(plyr)
library(reshape)
library(zoo)
###############################################################
# Setting options
# Keep strings as character vectors instead of factors when building
# data frames (the default in R < 4.0 was stringsAsFactors = TRUE).
options(stringsAsFactors = FALSE)
# Specify the pathname of the corpus
corpus.ind <- "C:\\PhD\\skripts n data\\corpora\\ICE India\\Corpus"
# Define the input pathname of the raw biodata (ICE header files)
bio.ind <- "C:\\PhD\\skripts n data\\corpora\\ICE India\\Headers"
# Define the output path of the final biodata table
# (separator style unified; the original mixed "\\" and "/")
out.ind <- "C:\\MeineHomepage\\docs\\data\\biodata ice india.txt"
###############################################################
###############################################################
###############################################################
### ICE India
###############################################################
### START
###############################################################
# Prepare for loading the corpus:
# collect all file paths below the corpus directory.
# NOTE(review): recursive = T together with include.dirs = T also returns
# directory names; scan() on a directory would fail, so this assumes the
# corpus tree contains only readable text files -- confirm.
corpus.files = list.files(path = corpus.ind, pattern = NULL, all.files = T,
full.names = T, recursive = T, ignore.case = T, include.dirs = T)
###############################################################
# Load every corpus file; scan() reads each file as character data
# split at tabs, so each file yields one character vector.
corpus.tmp <- lapply(corpus.files, function(x) {
corpus.tmp <- scan(x, what = "char", sep = "\t", quiet = T) } )
# Flatten the per-file vectors into one long character vector
corpus.tmp <- unlist(corpus.tmp)
# View results
#corpus.tmp
# Collapse all scanned elements into one single long string
corpus.tmp1 <- paste(corpus.tmp, collapse = " ")
# Inspect the resulting file
#corpus.tmp1
# Normalise the corpus string (order matters: runs of blanks are
# squeezed before newlines are converted to blanks, so a few double
# blanks may remain afterwards, exactly as in the original pipeline):
# declare UTF-8, squeeze blank runs, drop newlines, trim the ends.
corpus.tmp2 <- enc2utf8(corpus.tmp1)
corpus.tmp2 <- gsub("[ ]{2,}", " ", corpus.tmp2)
corpus.tmp2 <- gsub("\n", " ", corpus.tmp2, fixed = TRUE)
corpus.tmp2 <- str_trim(corpus.tmp2, side = "both")
# Inspect the resulting file
#corpus.tmp2
###############################################################
# Specify the pattern at which the corpus string is split into files
# NOTE(review): this pattern looks truncated -- a single blank would
# split the corpus into words, not conversations. It was presumably an
# ICE markup tag whose angle-bracket content was lost when the script
# was archived; restore it from the original script before running.
splitpattern2 = " "
# Split the corpus at the delimiter
corpus.tmp4 <- sapply(corpus.tmp2, function(x) {
strsplit(as.character(x), splitpattern2) } )
# Inspect the resulting object
#str(corpus.tmp4)
# Drop the first element: splitting leaves an empty/broken fragment
# before the first delimiter
corpus.tmp5 <- corpus.tmp4[[1]][2:length(corpus.tmp4[[1]])]
# Keep only the spoken files; elements after 335 are written files
# NOTE(review): hard-coded index -- verify 335 against the actual corpus.
corpus.tmp5 <- corpus.tmp5[1:335]
###############################################################
# Extract the file ids: first strip everything from the first speaker
# mark (e.g. "#A1:2:X") onwards, then strip everything up to and
# including the last colon, leaving only the file identifier.
file.ids.tmp1 <- lapply(corpus.tmp5, function(x) {
x <- str_replace_all(x, "(#[A-Z]{0,1}[0-9]{0,4}:[0-9]{0,4}:{0,1}[A-Z]{0,1}.*)", "") } )
file.ids.tmp2 <- sapply(file.ids.tmp1, function(x) {
x <- str_replace_all(x, "(.*:)", "") } )
# View results
#file.ids.tmp2
# Extract subfile ids: number the conversations of every file
# ("1", "2", ...). table(file.ids.tmp2) gives the number of
# conversations per file id; seq_len() turns each count n into the
# indices 1..n. This generalises the original hand-coded string
# replacements, which only covered files with up to four conversations.
# NOTE(review): like the original, this assumes table() lists the file
# ids in the same order in which they occur in file.ids.tmp2 (i.e. the
# corpus files were read in sorted order) -- confirm.
subfile.ids.tmp5 <- as.character(unlist(lapply(as.vector(table(file.ids.tmp2)),
  seq_len)))
# View results
#subfile.ids.tmp5
# Put the conversations into a data frame (one row per conversation)
corpus.tmp7 <- as.data.frame(corpus.tmp5)
# Combine a running id, file id, subfile id and the raw conversation
# text into one table
# (seq_len(nrow(...)) is safe for zero rows, unlike 1:length(...))
corpus.tb1 <- cbind(seq_len(nrow(corpus.tmp7)), file.ids.tmp2, subfile.ids.tmp5, corpus.tmp7[, 1])
# Add column names
colnames(corpus.tb1) <- c("id", "file", "subfile", "corpusfile")
# Rename object
corpus.table2 <- corpus.tb1
# View results
#head(corpus.table2)
###############################################################
### --- STEP: split every conversation into speech units
###############################################################
# Extract the raw conversation text (fourth column)
all.files <- corpus.table2[1:nrow(corpus.table2), 4]
# View results
#str(all.files)
# Split each conversation so that each speech unit is one element:
# a "~" is inserted before every speaker mark and then used as the
# split point.
# NOTE(review): the replacement "\\1~<" references capture group 1, but
# the pattern "ICE-IND:" contains no group -- the pattern was probably
# "(<ICE-IND:)" before angle-bracket material was lost in archiving;
# restore it from the original script.
all.files.unclean.tmp1 <- str_split(gsub("ICE-IND:", "\\1~<", all.files), "~")
# Drop the fragment before the first speaker mark
all.files.unclean.tmp2 <- sapply(all.files.unclean.tmp1, function(x) {
x <- x[2:length(x)] } )
# Name each conversation with its "file subfile" id
file.subfile.ids <- apply(corpus.table2[ , c(2, 3)], 1 , paste , collapse = " " )
names(all.files.unclean.tmp2) <- file.subfile.ids
# View results
#str(all.files.unclean.tmp2)
###############################################################
# Separate the speaker mark from the speech unit: split each unit at
# the first blank only (n = 2), giving c(speaker mark, utterance).
speakers.and.utts <- lapply(all.files.unclean.tmp2, function(x) {
str_split(x, " ", n = 2) } )
# View results
#speakers.and.utts
# Store the speaker marks (first fragment of each unit) in an extra list
speakers.tmp1 <- lapply(speakers.and.utts, function(x) {
sapply(x, "[[", 1) } )
# PROBLEM: speakers.tmp1[[8]][168] is broken!
# Repair it by blanking the element.
# NOTE(review): hard-coded repair index -- re-check if the corpus files
# or their ordering ever change.
speakers.tmp1[[8]][168] <- ""
# Reduce each speaker mark to the bare speaker letter by stripping the
# "<...:" prefix and the closing ">"
speakers <- lapply(speakers.tmp1, function(x) {
x <- str_replace_all(x, "(<.*:)","")
x <- str_replace_all(x, "(>)","") } )
# View results
#speakers
# Store the utterances (second fragment of each unit) in an extra list
speech.units <- lapply(speakers.and.utts, function(x) {
sapply(x, function(x) x[2]) } )
# View results
#speech.units
###############################################################
# Clean the speech units: remove ICE markup (metatext), then reattach
# clitics and normalise contractions so that forms such as "that's" or
# "I've" count as one word, and finally strip punctuation.
# NOTE(review): several regex patterns below are visibly truncated --
# "()" and "(.*?)" match nothing useful. The original patterns almost
# certainly contained ICE tags in angle brackets whose bracketed content
# was lost when the script was archived. Restore the patterns from the
# original script before re-running; as written, some markup would
# survive into the word counts.
speech.units.clean <- lapply(speech.units, function (x){
x <- str_replace_all(x, "()","")
x <- str_replace_all(x, "(<->.*?->)","")
x <- str_replace_all(x, "(<&.*?/&.*?>)","")
x <- str_replace_all(x, "()","")
x <- str_replace_all(x, "(<\\?.*?/\\?>)","")
x <- str_replace_all(x, "()","")
# WARNING: THEORETICAL ISSUE (pattern truncated, see note above)
x <- str_replace_all(x, "(.*?)","")
x <- str_replace_all(x, "()","")
x <- str_replace_all(x, "()","")
# x <- str_replace_all(x, "()","")
# Catch-all: remove any remaining tag-like material
x <- str_replace_all(x, "(<.*?>)", "")
# WARNING: THEORETICAL ISSUE
# Squeeze blanks, then reattach clitics that the markup removal split
# off their host words (" re " -> "'re " etc.)
x <- gsub(" {2,}", " ", x)
x <- gsub(" re |'re ", "'re ", x)
x <- gsub(" ll |'ll ", "'ll ", x)
x <- gsub(" {0,1}Ill ", " I'll ", x)
x <- gsub(" ve |'ve ", "'ve ", x)
# NOTE(review): these two are aggressive -- ANY isolated "s"/"d" token
# is glued onto the preceding word as a clitic.
x <- gsub(" s ", "'s ", x)
x <- gsub(" d ", "'d ", x)
# Restore apostrophes in common contractions
x <- gsub(" {0,1}I m ", " I'm ", x)
x <- gsub("Im ", " I'm ", x)
x <- gsub("Its", " It's ", x)
x <- gsub(" its", " it's ", x)
x <- gsub("Hes", " He's ", x)
x <- gsub(" hes", " he's ", x)
x <- gsub("Ive", " I've ", x)
x <- gsub(" {0,1}Thats ", " That's ", x)
x <- gsub(" thats ", " that's ", x)
x <- gsub(" {0,1}Theres ", " There's ", x)
x <- gsub(" theres ", " there's ", x)
x <- gsub("ouldnt ", "ouldn't ", x)
x <- gsub(" cant ", " can't ", x)
x <- gsub("Cant ", "Can't ", x)
x <- gsub("Dont ", "Don't ", x)
x <- gsub(" dont ", " don't ", x)
x <- gsub("Didnt ", "Didn't ", x)
x <- gsub(" didnt ", " didn't ", x)
x <- gsub("Isnt ", "Isn't ", x)
x <- gsub(" isnt ", " isn't ", x)
x <- gsub("Arent ", "Aren't ", x)
x <- gsub(" arent ", " aren't ", x)
x <- gsub(" havent ", " haven't ", x)
x <- gsub("Havent ", "Haven't ", x)
x <- gsub("Wasnt ", "Wasn't ", x)
x <- gsub(" wasnt ", " wasn't ", x)
# Replace punctuation and markup remnants by blanks, squeeze, and trim
x <- gsub("(\\?|\\(|\\)|\\{|\\}|\\[|\\]|\\$|\\&|\\.|-|>|<|\\?|/|=|,)", " ", x)
x <- gsub(" {2,}", " ", x)
x <- str_trim(x) } )
# View results
#str(speech.units.clean)
#speech.units.clean[[1]]
###############################################################
### --- Count the words of every speech unit
###############################################################
# Tokenize each cleaned speech unit at blanks
tokenized <- lapply(speech.units.clean, function(units) {
  strsplit(units, " ")
} )
# View results
#tokenized
# The word count of a speech unit is simply its number of tokens
word.count <- lapply(tokenized, function(units) {
  sapply(units, length)
} )
# View results
#word.count
###############################################################
# Create a list which holds, for every speech unit, the constant "1";
# summing these per speaker later yields the speech-unit counts.
# The original used gsub(".*", "1", y) per element; rep() expresses the
# same intent without regexes and avoids the trailing-empty-match
# behaviour of gsub(".*") in recent R versions (which yields "11").
utt.count.tmp1 <- lapply(word.count, function(x) {
  rep("1", length(x)) } )
#Add names (file.ids) to the number of speech.units
#names(utt.count.tmp1) <- file.id.tmp2
# Rename list
utt.count <- utt.count.tmp1
# View results
#utt.count
###############################################################
# Build, per conversation, a table holding the full speaker
# information: speaker, raw speech unit, cleaned speech unit,
# conversation id, speech-unit count and word count.
###############################################################
# Columns 1-2: speaker and raw utterance
speaker.and.unclean.utts <- mapply(cbind, speakers, speech.units, SIMPLIFY = FALSE)
# View results
#speaker.and.unclean.utts
# Column 3: cleaned utterance
speaker.both.utts <- mapply(cbind, speaker.and.unclean.utts, speech.units.clean, SIMPLIFY = FALSE)
names(speaker.both.utts) <- file.subfile.ids
# View results
#str(speaker.both.utts)
#speaker.both.utts
# Column 4: "file subfile" id of the conversation
speaker.both.utts.subfile <- mapply(cbind, speaker.both.utts, names(speaker.both.utts), SIMPLIFY = FALSE)
# View results
#speaker.both.utts.subfile
# Column 5: speech-unit count ("1" for every unit)
speaker.both.utts.subfile.and.utt.count <- mapply(cbind, speaker.both.utts.subfile, utt.count, SIMPLIFY = FALSE)
# View results
#speaker.both.utts.subfile.and.utt.count
# Column 6: word count per unit
speakerinfo1 <- mapply(cbind, speaker.both.utts.subfile.and.utt.count, word.count, SIMPLIFY = FALSE)
# Name the per-conversation tables
names(speakerinfo1) <- file.subfile.ids
# View results
#speakerinfo1
# Convert each matrix into a data frame and make the count columns
# (5: speech-unit count, 6: word count) numeric
speakerinfo2 <- lapply(speakerinfo1, function(tb) {
  tb <- as.data.frame(tb)
  tb[, 5] <- as.numeric(tb[, 5])
  tb[, 6] <- as.numeric(tb[, 6])
  tb
} )
# View results
#speakerinfo2
###############################################################
# Keep a copy under a separate name for later kwic searches
kwic.tb.ice.ind <- speakerinfo2
###############################################################
# Per conversation: sum the word counts (column 6) per speaker
# (column 1).
# NOTE(review): the inner sapply ignores its argument y, so the same
# tapply result is computed once per column of x (six identical
# copies); the first copy is extracted below. Wasteful but harmless.
word.count.result <- lapply(speakerinfo2, function(x) {
sapply(x, function(y) as.data.frame(tapply(x[[6]], x[[1]], sum))) } )
# View results
#word.count.result
# Keep only the first of the identical copies
overview.word.count.results <- sapply(word.count.result, "[[", 1)
# View results
#overview.word.count.results
# Per conversation: sum the speech-unit counts (column 5) per speaker
# (same redundant-copies pattern as above)
speech.unit.count.result <- lapply(speakerinfo2, function(x) {
sapply(x, function(y) as.data.frame(tapply(x[[5]], x[[1]], sum))) } )
# View results
#speech.unit.count.result
# Keep only the first copy
overview.speech.unit.count.results <- sapply(speech.unit.count.result, "[[", 1)
# View results
#overview.speech.unit.count.results
###############################################################
# Extract the speaker column of every conversation
# NOTE(review): the inner sapply again ignores y and returns the whole
# first column once per column of x; column 1 is selected below.
speaker.id.list.tmp1 <- lapply(speakerinfo2, function(x) {
sapply(x, function(y) {
x <- x[[1]] } ) } )
# View results
#speaker.id.list.tmp1
# Extract all elements from the first column (the speaker refs)
speaker.id.list.tmp2 <- sapply(speaker.id.list.tmp1, function(x) {
x <- x[, 1] } )
# View results
#speaker.id.list.tmp2[[1]]
# Tabulate the speaker refs per conversation
speaker.ids.tmp1 <- sapply(speaker.id.list.tmp2, function(x) {
table(x) } )
# View results
#speaker.ids.tmp1
# The table names are the distinct speaker ids of each conversation
speaker.ids.tmp2 <- sapply(speaker.ids.tmp1, function(x) {
x <- names(x) } )
# View results
#speaker.ids.tmp2
# Flatten the distinct speaker ids into one vector
speaker.ids.tmp3 <- as.vector(unlist(speaker.ids.tmp2))
# Rename vector
speaker.ids <- speaker.ids.tmp3
# View results
#speaker.ids
# Extract the number of distinct speakers per conversation
no.speakers.in.file.tmp1 <- sapply(speaker.ids.tmp2, function(x) {
x <- length(x) } )
# View results
#no.speakers.in.file.tmp1
# Repeat each conversation name as many times as there are speakers
# in that conversation
full.file.ids.tmp3 <- as.vector(rep(names(no.speakers.in.file.tmp1), as.vector(as.numeric(no.speakers.in.file.tmp1))))
# View results
#full.file.ids.tmp3
# Rebuild the canonical "<file#subfile>" form of the conversation id
full.file.ids.tmp4 <- str_replace_all(full.file.ids.tmp3, " ", "#")
full.file.ids.tmp5 <- paste("<", full.file.ids.tmp4, ">")
full.file.ids.tmp6 <- str_replace_all(full.file.ids.tmp5, " ", "")
# View results
#full.file.ids.tmp6
# View results
#full.file.ids.tmp4
# Extract the text ids (part before the blank)
text.ids <- str_replace_all(full.file.ids.tmp3, " .*", "")
# Extract the subfile ids (part after the blank)
subfile.ids <- str_replace_all(full.file.ids.tmp3, ".* ", "")
###############################################################
# Extract the speech-unit counts for each speaker as a plain vector so
# that a table can be built from the results
# NOTE(review): the nested sapply calls depend on how sapply simplified
# the overview objects above; verify the structure before refactoring.
speech.unit.count.list <- lapply(overview.speech.unit.count.results, function(x) { sapply(x, function(y) { sapply(y, "[[", 1)} )})
# View results
#speech.unit.count.list
# Convert the list into a vector
speech.unit.counts <- as.vector(unlist(speech.unit.count.list))
# View results
#speech.unit.counts
###
# Extract the word counts for each speaker in the same way
word.count.list <- lapply(overview.word.count.results, function(x) {
sapply(x, function(y) { sapply(y, "[[", 1)} ) } )
# View results
#word.count.list
# Convert the list into a vector
word.counts <- as.vector(unlist(word.count.list))
# View results
#word.counts
###############################################################
# Build the per-speaker table: running id, conversation id, file,
# subfile, speaker, speech-unit count and word count.
# seq_along() replaces c(1:length(...)), which would yield c(1, 0) for
# an empty input.
id <- seq_along(full.file.ids.tmp3)
# Combine everything into a data frame
speakerinfo.ice.india.tb.tmp1 <- as.data.frame(cbind(id, full.file.ids.tmp6, text.ids, subfile.ids, speaker.ids, speech.unit.counts, word.counts))
colnames(speakerinfo.ice.india.tb.tmp1) <- c("id", "file.speaker.id", "file", "subfile", "speakers", "speech.unit.count", "word.count")
# View results
#speakerinfo.ice.india.tb.tmp1
###############################################################
### --- STEP: load and clean the ICE header files (speaker biodata)
###############################################################
# Collect all header file paths
header.files = list.files(path = bio.ind, pattern = NULL, all.files = T,
full.names = T, recursive = T, ignore.case = T, include.dirs = T)
###############################################################
# Load and store the header files (one character vector per file)
headers.tmp <- lapply(header.files, function(x) {
header.tmp <- scan(x, what = "char", sep = "\t", quiet = T) } )
# Delete the broken first element
# NOTE(review): presumably the directory entry returned by
# include.dirs = T -- confirm against the actual Headers directory.
headers.tmp1 <- headers.tmp[2:length(headers.tmp)]
# Keep only the headers of the spoken files
# NOTE(review): hard-coded index -- verify 300 against the corpus.
headers.tmp2 <- headers.tmp1[1:300]
# Collapse each header into a single string
headers.tmp3 <- lapply(headers.tmp2, function(x) {
paste(x, collapse = "") } )
# Squeeze runs of blanks
headers.tmp4 <- lapply(headers.tmp3, function(x) {
x <- gsub(" {2,}", " ", x) } )
headers.tmp4 <- as.character(headers.tmp4)
# View results
#headers.tmp4
# Clean the headers: declare UTF-8, squeeze blanks, drop newlines, trim
headers.tmp5 <- enc2utf8(headers.tmp4)
headers.tmp5 <- gsub(" {2,}", " ", headers.tmp5)
headers.tmp5 <- str_replace_all(headers.tmp5, fixed("\n"), " ")
headers.tmp5 <- str_trim(headers.tmp5, side = "both")
# Inspect the resulting file
#headers.tmp5
###############################################################
# Split the headers at tag boundaries ("~" is inserted before every
# "<" and then used as the split point); trim the fragments.
headers.tmp6 <- sapply(headers.tmp5, function(x) {
str_split(gsub("<", "~\\1<", x, perl = T), "~") } )
headers.tmp6 <- sapply(headers.tmp6, function(x) {
str_trim(x, side = "both") } )
# Inspect the resulting object
#headers.tmp6[1]
# NOTE(review): the next statement is syntactically broken (the
# str_split( call is never closed and the parentheses do not match),
# and the two lines after it reference objects (headers.speakers.tmp2,
# headers.speakers.tmp4) that are never defined anywhere in this file.
# This section was evidently damaged when the script was archived --
# restore it from the original before running.
headers.speakers.tmp1 <- sapply(headers.tmp5, function(x) {
str_split(gsub("ICE-IND-", "", x, perl = T)
x <- gsub(">.*", "", x, perl = T)
x <- gsub("(1)", "", x, fixed = TRUE, perl = T) } ))
# Add names to elements in headers.speakers.tmp2
names(headers.speakers.tmp2) <- headers.speakers.tmp4
###############################################################
# Extract the file-level metadata (recording date and place, text
# category, file word count, participants, audience size, communicative
# situation, organising body, copyright) from the headers, repeating
# each value once per speaker of the file.
# NOTE(review): this whole section is damaged. The first statement is
# syntactically broken; every gsub(".*", "", x) would blank its input
# completely, so the real patterns (which contained the header tags in
# angle brackets) were evidently stripped when the script was archived;
# and headers.tmp9 as well as textcode are referenced but never defined.
# Restore this section from the original script before running.
headers.speakers.tmp5 <- sapply(headers.tmp5, function(x) {
str_split(gsub(".*", "", x, perl = T)
x <- gsub("", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub("ICE-IND-", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
date.of.recording <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
place.of.recording <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
text.category <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
file.wordcount <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
no.of.participants <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
relationship.of.participants <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
audience.size <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
communicative.situation <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
organising.body <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
copyright.statement <- rep(as.vector(sapply(headers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )), no.speakers.in.file.tmp1)
# Combine the file-level metadata into one data frame
speakerinfo.ice.india.tb.tmp2 <- as.data.frame(cbind(textcode, date.of.recording, place.of.recording, text.category, file.wordcount, no.of.participants, relationship.of.participants, audience.size, communicative.situation, organising.body, copyright.statement))
# Delete duplicated rows
speakerinfo.ice.india.tb.tmp2 <- speakerinfo.ice.india.tb.tmp2[!duplicated(speakerinfo.ice.india.tb.tmp2), ]
# Inspect resulting table
#head(speakerinfo.ice.india.tb.tmp2)
###############################################################
# Extract the per-speaker biodata (speaker id, communicative role,
# name, age, gender, nationality, birthplace, education, occupation,
# affiliations, mother tongue, other languages, free comments).
# NOTE(review): damaged in the same way as the section above --
# headers.speakers.tmp9 is never defined in this file and every
# gsub(".*", "", x) pattern has lost its tag content, so each field
# would come out empty. Restore the original patterns before running.
file.ids <- names(headers.speakers.tmp9)
# Number of speakers listed in each header
no.speakers.in.file.ice.india <- sapply(headers.speakers.tmp9, function(x){
length(x) } )
# Repeat each file id once per speaker
file.ids.tmp1 <- rep(file.ids, no.speakers.in.file.ice.india)
# Split "file(subfile)" into its file part ...
file.ids.tmp2 <- as.vector(sapply(file.ids.tmp1, function(x) {
x <- gsub("\\(.*", "", x, perl = TRUE) } ))
# ... and its subfile part
subfile.ids <- as.vector(sapply(file.ids.tmp1, function(x) {
x <- gsub(".*\\(", "", x, perl = TRUE)
x <- gsub(")", "", x, perl = TRUE, fixed = TRUE)
x <- gsub("S.*", "1", x, perl = TRUE) } ))
speaker.id <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
communicative.role <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
surname <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
forenames <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
age <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
gender <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
nationality <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
birthplace <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
education <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
occupation <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
affiliations <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
mother.tongue <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
other.languages <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
free.comments <- as.vector(unlist(sapply(headers.speakers.tmp9, function(x) {
x <- gsub(".*", "", x, perl = TRUE)
x <- gsub(".*", "", x, perl = TRUE) } )))
# Combine the per-speaker biodata into one data frame
speakerinfo.ice.india.tb.tmp3 <- as.data.frame(cbind(file.ids.tmp2, subfile.ids, speaker.id, communicative.role, surname, forenames, age, gender, nationality, birthplace, education, occupation, affiliations, mother.tongue, other.languages, free.comments))
rownames(speakerinfo.ice.india.tb.tmp3) <- 1:length(speakerinfo.ice.india.tb.tmp3[, 1])
# Inspect resulting table
#head(speakerinfo.ice.india.tb.tmp3)
###############################################################
### --- STEP: merge the word counts with the header biodata
###############################################################
# Harmonise the column names used as join keys
colnames(speakerinfo.ice.india.tb.tmp1)[3:5] <- c("text.id", "subfile.id", "spk.ref")
colnames(speakerinfo.ice.india.tb.tmp2)[1] <- "text.id"
# Join the file-level metadata onto the speaker table
biodata.ice.ind.tmp1 <- join(speakerinfo.ice.india.tb.tmp1, speakerinfo.ice.india.tb.tmp2, by = c("text.id"), type = "left")
# Inspect resulting table
#head(biodata.ice.ind.tmp1)
#length(biodata.ice.ind.tmp1[, 1])
# Harmonise the join keys of the per-speaker biodata
colnames(biodata.ice.ind.tmp1)[5] <- "spk.ref"
colnames(speakerinfo.ice.india.tb.tmp3)[1:3] <- c("text.id", "subfile.id", "spk.ref")
# Join the per-speaker biodata onto the combined table
biodata.ice.ind <- join(biodata.ice.ind.tmp1, speakerinfo.ice.india.tb.tmp3, by = c("text.id", "subfile.id", "spk.ref"), type = "left")
# Repair broken cells row by row
# NOTE(review): the first repair pattern gsub(".*", "", x) would blank
# every cell -- its tag content was lost in archiving; restore it.
# Also note that the ncol argument is silently ignored by as.matrix().
biodata.ice.ind <- as.matrix(t(apply(biodata.ice.ind, 1, FUN = function(x) {
x <- gsub(".*", "", x, perl = TRUE) } )), ncol = length(biodata.ice.ind[1, ]))
# Truncate trailing noise after "Dialogue"
biodata.ice.ind <- as.matrix(t(apply(biodata.ice.ind, 1, FUN = function(x) {
x <- gsub("Dialogue.*", "Dialogue", x, perl = TRUE) } )), ncol = length(biodata.ice.ind[1, ]))
# Truncate trailing noise after "U.G.C. Refresher Course"
biodata.ice.ind <- as.matrix(t(apply(biodata.ice.ind, 1, FUN = function(x) {
x <- gsub("U\\.G\\.C\\. Refresher Course.*", "U.G.C. Refresher Course", x, perl = TRUE) } )), ncol = length(biodata.ice.ind[1, ]))
# Replace empty cells and the literal string "NA" by real NA values
biodata.ice.ind <- as.matrix(t(apply(biodata.ice.ind, 1, function(x) {
ifelse(x == "", NA,
ifelse(x == "NA", NA,
ifelse(is.na(x), NA, x))) } )), ncol = length(biodata.ice.ind [1, ]))
# Relabel the running ids
biodata.ice.ind[, 1] <- 1: length(biodata.ice.ind[, 1])
# Reorder the table (move the speech-unit and word counts to the end)
biodata.ice.ind <- cbind(biodata.ice.ind[, c(1:5, 8:length(biodata.ice.ind[1, ]), 6:7)])
# Lower-case all biodata columns
biodata.ice.ind[, c(6:length(biodata.ice.ind[1, ]))] <- tolower(biodata.ice.ind[, c(6:length(biodata.ice.ind[1, ]))])
# Inspect resulting table
#head(biodata.ice.ind)
# Recategorize age: collapse the raw age values into the bands 18-25,
# 26-33, 34-41, 42-49 and 50+; ranges that straddle band boundaries
# (e.g. "24-33", "40+") cannot be categorized and are set to NA; values
# not listed in either set pass through unchanged, and NA stays NA.
# The original used two ~25-level nested ifelse() chains (split in two
# only because of the nesting depth); a lookup table reproduces the
# same mapping, is easier to audit and extend, and handles any value.
recode.age.band <- function(age) {
  # Raw value -> age band
  age.map <- c(
    "18" = "18-25", "19" = "18-25", "21" = "18-25", "22" = "18-25",
    "23" = "18-25", "25" = "18-25", "18-26" = "18-25",
    "25-33" = "26-33", "26-33" = "26-33", "30" = "26-33", "33" = "26-33",
    "35" = "34-41", "35-41" = "34-41", "36" = "34-41", "39" = "34-41",
    "40" = "34-41", "41" = "34-41",
    "42" = "42-49", "42-49" = "42-49", "45" = "42-49", "47" = "42-49",
    "49" = "42-49",
    "50" = "50+", "52" = "50+", "52+" = "50+", "53" = "50+",
    "54" = "50+", "55" = "50+", "55+" = "50+", "56" = "50+",
    "57" = "50+", "58" = "50+", "60+" = "50+", "62" = "50+",
    "64" = "50+", "65" = "50+", "67" = "50+", "69" = "50+",
    "71" = "50+", "78" = "50+", "88" = "50+")
  # Ranges spanning more than one band cannot be categorized
  age.drop <- c("24-33", "24-41", "33-41", "34-51", "40+", "45+", "41-49")
  mapped <- unname(age.map[age])
  age[!is.na(mapped)] <- mapped[!is.na(mapped)]
  age[age %in% age.drop] <- NA
  age
}
biodata.ice.ind[, 19] <- recode.age.band(biodata.ice.ind[, 19])
# Recategorize mother tongue: languages too sparsely represented are
# set to NA; variant and multi-language labels are collapsed onto their
# base language; everything else (including NA) passes through
# unchanged. A lookup table replaces the original 17-level nested
# ifelse() chain with the same mapping.
recode.mother.tongue <- function(mt) {
  # Languages set to NA (too few speakers to analyse)
  mt.drop <- c("angami", "assamese", "bangla", "english", "bhojpuri",
    "gujrati", "kashmiri", "khasi", "manipuri", "marwari", "naga",
    "nepali", "oriya", "sindhi", "tulu")
  # Variant labels collapsed onto a base language
  mt.map <- c("hindi(bihari)" = "hindi", "marathi, kannada" = "marathi")
  mapped <- unname(mt.map[mt])
  mt[!is.na(mapped)] <- mapped[!is.na(mapped)]
  mt[mt %in% mt.drop] <- NA
  mt
}
biodata.ice.ind[, 26] <- recode.mother.tongue(biodata.ice.ind[, 26])
# Recategorize the date of recording: reduce values such as "june 1991"
# or "12-9-92" to a four-digit year.
biodata.ice.ind[, 6] <- as.vector(unlist(sapply(biodata.ice.ind[, 6], function(x) {
# Keep only the part after the last hyphen (usually the 2-digit year)
x <- gsub(".*-", "", x)
x <- gsub("june 1991", "91", x)
# Expand 2-digit years: "91" -> "1991". The follow-up replacement
# repairs "99", which the previous step turns into "199199".
# NOTE(review): this trick only works for two-digit 1990s years --
# confirm no other date formats occur in the data.
x <- gsub("9", "199", x)
x <- gsub("199199", "1999", x) } )))
###############################################################
###############################################################
###############################################################
# ICE India: final objects
#kwic.tb.ice.ind
#head(kwic.tb.ice.ind)
#biodata.ice.ind
#head(biodata.ice.ind)
###
# Save the final biodata table as a tab-separated text file.
# write.table() creates (or overwrites) the output file itself, so the
# original file.create() call was redundant and its result unused.
write.table(biodata.ice.ind, out.ind, sep = "\t", row.names = FALSE)
###############################################################
###############################################################
###############################################################
### --- THE END
###############################################################
###############################################################
###############################################################
# Remove all lists from the current workspace (disabled)
#rm(list=ls(all=T))
###############################################################
###############################################################
###############################################################