Text as Data¶
Textual data have rapidly gained attention in the communication, social, and political sciences. Without their discussion, a companion to data management would be incomplete. This chapter starts with a discussion of basic operations on character strings, such as concatenation, search, and replacement. It then moves on to the management of corpora of text. It also discusses routine issues in the management of textual data, such as stemming, stop-word deletion, and the creation of term-frequency matrices.
Below is the supporting material for the various sections of the chapter.
Character Strings¶
## 'length()' versus 'nchar()' #######################################################
# 'length()' counts the elements of a vector, while 'nchar()' counts the
# characters in each element -- a common point of confusion with text data.
some_great_rock_bands <- c("Led Zeppelin","Pink Floyd","Queen")
length(some_great_rock_bands) # 3: one element per band name
nchar(some_great_rock_bands)  # 12 10 5: characters per element
## Character vector subsets versus Substrings ########################################
# '[' selects whole elements; 'substr()' extracts character ranges
# within each element (vectorized over all elements).
some_great_rock_bands[1:2]
substr(some_great_rock_bands,start=1,stop=2)
substr(some_great_rock_bands,start=6,stop=15) # 'stop' beyond the string end is truncated
## Finding patterns within character strings and character vectors ##################
some_great_rock_bands <- c("Led Zeppelin","Pink Floyd","Queen")
grep("Zeppelin",some_great_rock_bands) # Just the indices
grep("Zeppelin",some_great_rock_bands, value=TRUE) # the matching elements themselves
grepl("Zeppelin",some_great_rock_bands) # a logical vector, one flag per element
grep("[ei]n$",some_great_rock_bands,value=TRUE) # regex: elements ending in "en" or "in"
## Replacing patterns within character strings and character vectors ################
some_great_rock_bands <- c("Led Zeppelin","Pink Floyd","Queen")
sub("e","i",some_great_rock_bands)  # replaces only the first match per element
gsub("e","i",some_great_rock_bands) # replaces all matches per element
# The backreference '\\1' reinserts the captured vowel run, here in brackets:
gsub("([aeiouy]+)","[\\1]",some_great_rock_bands)
## Counting words in the UK party manifesto on occasion of the 2017 election ########
# First, the data are read in; the CSV file provides the manifesto text
# in a column named 'content'
Labour.2017 <- read.csv("UKLabourParty_201706.csv",
stringsAsFactors=FALSE)
# Second, a non-ASCII character is substituted: the UTF-8 byte sequence
# E2 80 99 (the "right single quotation mark") becomes a plain apostrophe
Labour.2017$content <- gsub("\xE2\x80\x99","'",Labour.2017$content)
str(Labour.2017)
# The variable 'content' contains the text of the manifesto;
# from here on we work with that character vector only
Labour.2017 <- Labour.2017$content
Labour.2017[1:5]
# The headings in the manifesto are all-uppercase, this helps
# to identify them.
# NOTE(review): lines without any letters (e.g. empty strings or pure
# numbers) would also equal their uppercased form -- confirm none occur.
Labour.2017.hlno <- which(Labour.2017==toupper(Labour.2017))
Labour.2017.headings <- Labour.2017[Labour.2017.hlno]
Labour.2017.headings[1:4]
# All non-heading text is changed to lowercase, so that later word
# counts are not case-sensitive
labour.2017 <- tolower(Labour.2017[-Labour.2017.hlno])
labour.2017[1:5]
# All lines that contain the pattern 'econom' are collected
# (this matches "economy", "economic", "economist", ...)
ecny.labour.2017 <- grep("econom",labour.2017,value=TRUE)
ecny.labour.2017[1:5]
# Using 'strsplit()' the lines are split into words at runs of spaces
# and basic punctuation characters
labour.2017.words <- strsplit(labour.2017,"[ ,.;:]+")
str(labour.2017.words[1:5])
# The result is a list with one character vector per line.
# We flatten it into a single character vector of words.
labour.2017.words <- unlist(labour.2017.words)
labour.2017.words[1:20]
# We now count the words and look at the 20 most common ones.
labour.2017.nwords <- table(labour.2017.words)
labour.2017.nwords <- sort(labour.2017.nwords,decreasing=TRUE)
labour.2017.nwords[1:20]
Text Corpora with the tm Package¶
## A first very simple example #######################################################
# Activating the 'tm' package - you may need to install it first.
library(tm)
# We activate the 'acq' data, a corpus of 50 example news articles
# shipped with 'tm' (see '?acq' for details)
data(acq)
acq # printing a corpus shows only a short summary
# We take a look at the first element in the corpus, a text document:
class(acq[[1]])
acq[[1]]          # prints a summary of the document ...
inspect(acq[[1]]) # ... while 'inspect()' shows its content in full
# We take a look at the document metadata
meta(acq[[1]])
DublinCore(acq[[1]]) # the same metadata, in Dublin Core terms
## Another example, involving data from the Manifesto Project ########################
# First, we import some Manifesto Project data
# The Manifesto Project data is contained in a collection of CSV files.
# Note that the 'pattern' argument of 'dir()' is a regular expression,
# not a shell glob: the anchored "\\.csv$" matches the file extension
# exactly, whereas the original "*.csv" relied on lenient handling of
# an invalid leading '*' in the regex engine.
csv.files <- dir("ManifestoProject",full.names=TRUE,
                 pattern="\\.csv$")
csv.files
# This file contains the relevant metadata (party, date, title, ...):
manifesto.metadata <- read.csv("documents_MPDataset_MPDS2019b.csv",
                               stringsAsFactors=FALSE)
# The following code does not work as intended, due to the peculiar
# structure of the CSV files (the text sits in a data column rather
# than in plain-text files); it is shown for illustration only:
manifesto.corpus <- VCorpus(DirSource("ManifestoProject"))
# To deal with the problem created by the peculiar structure of the files, we
# define a helper function:
# Read one Manifesto Project CSV file and turn it into a 'tm'
# PlainTextDocument, attaching metadata looked up in 'metadata.file'.
#
# file:          path to a CSV file named '<party>_<date>.csv', whose
#                first column contains the manifesto text line by line
# metadata.file: data frame with at least the columns 'party' and
#                'date' (and optionally 'title', 'language',
#                'partyname'), as in the Manifesto Project dataset
#
# Returns: a PlainTextDocument with the concatenated text as content.
getMDoc <- function(file,metadata.file){
  df <- read.csv(file,
                 stringsAsFactors=FALSE)
  # The text is in the first column, one line per row
  content <- paste(df[,1],collapse="\n")
  fn <- basename(file)
  # Strip the '.csv' extension. The pattern is anchored at the end of
  # the name: the original 'sub(".csv","",fn,fixed=TRUE)' removed the
  # first occurrence of ".csv" anywhere in the file name.
  fn <- sub("\\.csv$","",fn)
  # File names have the form '<party>_<date>'
  fn12 <- unlist(strsplit(fn,"_"))
  partycode <- as.numeric(fn12[1])
  datecode <- as.numeric(fn12[2])
  # The date code is 'YYYYMM'
  year <- datecode %/% 100
  month <- datecode %% 100
  datetime <- ISOdate(year=year,month=month,day=1)
  # Select the metadata record for this party and date
  mf.meta <- subset(metadata.file,
                    party==partycode & date == datecode)
  # Fall back to English when the metadata provide no language.
  # Computing the value first (instead of assigning into 'mf.meta')
  # avoids assigning a length-1 column to a zero-row data frame,
  # which is an error in R when no metadata record matched.
  language <- if(length(mf.meta$language) > 0) mf.meta$language
              else "english"
  PlainTextDocument(
    content,
    id = fn,
    heading = mf.meta$title,
    datetimestamp = as.POSIXlt(datetime),
    language = language,
    partyname = mf.meta$partyname,
    partycode = partycode,
    datecode = datecode
  )
}
# With the helper function we now create a corpus of UK manifestos:
# 'lapply()' applies 'getMDoc()' to each CSV file, yielding a list of
# text documents ...
UKLib.docs <- lapply(csv.files,getMDoc,
metadata.file=manifesto.metadata)
# ... which 'as.VCorpus()' wraps into a (volatile, in-memory) corpus
UKLib.Corpus <- as.VCorpus(UKLib.docs)
UKLib.Corpus
UKLib.Corpus[[14]]
# We need to deal with the non-ASCII characters ("curly" quotation
# marks stored as UTF-8 byte sequences), so we define yet another
# helper function:
# Replace the UTF-8 "smart" quotation marks in the content of a 'tm'
# text document with plain ASCII stand-ins and return the modified
# document. The replacements cover U+2018 through U+201F.
handleUTF8quotes <- function(x){
  # Each UTF-8 byte sequence is mapped to its ASCII substitute;
  # the substitutions are applied in the order given below.
  ascii.for <- c(
    "\xe2\x80\x98" = "'",  # left single quotation mark
    "\xe2\x80\x99" = "'",  # right single quotation mark
    "\xe2\x80\x9a" = ",",  # single low-9 quotation mark
    "\xe2\x80\x9b" = "`",  # single high-reversed-9 quotation mark
    "\xe2\x80\x9c" = "\"", # left double quotation mark
    "\xe2\x80\x9d" = "\"", # right double quotation mark
    "\xe2\x80\x9e" = "\"", # double low-9 quotation mark
    "\xe2\x80\x9f" = "\""  # double high-reversed-9 quotation mark
  )
  cx <- content(x)
  for(pattern in names(ascii.for))
    cx <- gsub(pattern,ascii.for[[pattern]],cx)
  content(x) <- cx
  x
}
# Another helper function is needed to change the texts into lowercase:
# it lowercases the content of a 'tm' text document and returns the
# document; the document metadata are left untouched.
toLower <- function(x) {
  txt <- content(x)
  content(x) <- tolower(txt)
  x
}
UKLib.Corpus.processed <- tm_map(UKLib.Corpus,handleUTF8quotes)
UKLib.Corpus.processed <- tm_map(UKLib.Corpus.processed,toLower)
inspect(UKLib.Corpus.processed[[14]])
# Numbers and punctuation are removed from all documents ...
UKLib.Corpus.processed <- tm_map(UKLib.Corpus.processed,removeNumbers)
UKLib.Corpus.processed <- tm_map(UKLib.Corpus.processed,removePunctuation)
inspect(UKLib.Corpus.processed[[14]])
# ... as are English stop words ("the", "and", "of", ...), before the
# leftover runs of whitespace are collapsed
UKLib.Corpus.processed <- tm_map(UKLib.Corpus.processed,removeWords,stopwords("english"))
UKLib.Corpus.processed <- tm_map(UKLib.Corpus.processed,stripWhitespace)
inspect(UKLib.Corpus.processed[[14]])
# Finally, words are reduced to their stems
UKLib.Corpus.processed <- tm_map(UKLib.Corpus.processed,stemDocument)
inspect(UKLib.Corpus.processed[[14]])
# After preprocessing the text documents we obtain a document-term matrix:
UKLib.dtm <- DocumentTermMatrix(UKLib.Corpus.processed)
UKLib.dtm
# The various preprocessing steps can be combined into a single step,
# using the 'control' argument of 'DocumentTermMatrix()':
UKLib.dtm <- DocumentTermMatrix(
  tm_map(UKLib.Corpus,handleUTF8quotes),
  control=list(
    tolower=TRUE,
    removePunctuation=TRUE,
    # BUG FIX: the control option is 'removeNumbers' (plural); the
    # misspelled 'removeNumber' was silently ignored, so numbers were
    # not removed in this variant of the preprocessing.
    removeNumbers=TRUE,
    stopwords=TRUE,
    language="en",
    stemming=TRUE
  ))
Improvements Provided by the quanteda Package¶
## A basic example using the 'quanteda' package #############################################
library(quanteda)
# Limit corpus printing to three documents
quanteda_options(print_corpus_max_ndoc=3)
# This is an example corpus contained in the 'quanteda' package:
# the inaugural addresses of the US presidents
data_corpus_inaugural
mode(data_corpus_inaugural)  # the storage mode ...
class(data_corpus_inaugural) # ... and the class of the corpus object
data_corpus_inaugural[1:3]
# Document-level variables ('docvars') accompany the texts
str(docvars(data_corpus_inaugural))
docvars(data_corpus_inaugural,"Year")
data_corpus_inaugural$Year # '$' is shorthand for a single docvar
corpus_subset(data_corpus_inaugural, Year > 1945)
# Defining an S3 method makes the generic 'subset()' work for corpora:
subset.corpus <- function(x,...) corpus_subset(x,...)
subset(data_corpus_inaugural, Year > 1945)
# A small helper: all documents whose text matches 'pattern'
docs_containing <- function(x,pattern,...) x[grep(pattern,x,...)]
c_sub <- docs_containing(data_corpus_inaugural,"[Cc]arnage")
c_sub$President
# 'corpus_reshape()' changes the document unit, here to sentences
inaugural_sntc <- corpus_reshape(data_corpus_inaugural,
to="sentences")
inaugural_sntc
# Average sentence length (in characters) per year ...
sntcl <- cbind(docvars(inaugural_sntc),
len=nchar(inaugural_sntc))
head(sntcl)
sntcl.year <- aggregate(len~Year,data=sntcl,mean)
# ... plotted over time, with a smoother
with(sntcl.year,
scatter.smooth(Year,len,ylab="Average length of sentences in characters"))
# Reshaping back to whole documents recovers the original structure
inaugural_ <- corpus_reshape(data_corpus_inaugural,
to="documents")
all(inaugural_$Year == data_corpus_inaugural$Year)
## Obtaining tokens in text documents using 'quanteda' ###################################
# Limit token printing to three documents with six tokens each
quanteda_options(print_tokens_max_ndoc=3,
print_tokens_max_ntoken=6)
# 'tokens()' splits each document into a sequence of tokens
inaugural_toks <- tokens(data_corpus_inaugural)
inaugural_toks
# Number of tokens per speech ...
inaugural_ntoks <- sapply(inaugural_toks,
length)
inaugural_ntoks <- cbind(docvars(inaugural_toks),
ntokens = inaugural_ntoks)
# ... plotted over time, with a smoother
with(inaugural_ntoks,
scatter.smooth(Year,ntokens,
ylab="Number of tokens per speech"))
# The same at the sentence level, using the reshaped corpus from above
inaugural_sntc_toks <- tokens(inaugural_sntc)
inaugural_sntc_ntoks <- sapply(inaugural_sntc_toks,
length)
inaugural_sntc_ntoks <- cbind(docvars(inaugural_sntc_toks),
ntokens = inaugural_sntc_ntoks)
with(inaugural_sntc_ntoks,
scatter.smooth(Year,ntokens,
ylab="Number of tokens per sentence"))
## Preparing Manifesto Project data for analysis ##########################################
# Collect the Manifesto Project CSV files. The 'pattern' argument of
# 'dir()' is a regular expression, not a shell glob, so the extension
# is matched with an anchored "\\.csv$" (the original "*.csv" only
# worked because the regex engine tolerates the invalid leading '*').
csv.files <- dir("ManifestoProject",
                 full.names=TRUE,
                 pattern="\\.csv$")
length(csv.files)
# 'readtext' (a companion package for 'quanteda') is somewhat better able to
# deal with the Manifesto Project CSV files than 'tm'
# (note: 'readtext()' itself does take glob patterns):
library(readtext)
# The text sits in the first field of each file; party code and date
# are recovered from the file names '<party>_<date>.csv'
UKLib.rt <- readtext("ManifestoProject/*.csv",
                     text_field=1,
                     docvarsfrom="filenames",
                     docvarnames=c("party","date"))
nrow(UKLib.rt)
# Here we create an index of documents in the corpus: the rows that
# belong to one manifesto are pasted into a single text per party/date
UKLib.rta <- aggregate(text~party+date,
                       FUN=function(x)paste(x,collapse=" "),
                       data=UKLib.rt)
nrow(UKLib.rta)
# Document ids of the form '<party>_<date>'
UKLib.rta <- within(UKLib.rta,
                    doc_id <- paste(party,date,sep="_"))
UKLib.corpus <- corpus(UKLib.rta)
UKLib.corpus
# Here we combine metadata with the text documents:
manifesto.metadata <- read.csv("documents_MPDataset_MPDS2019b.csv",stringsAsFactors=FALSE)
# 'merge()' matches metadata records to the docvars by party and date.
# NOTE(review): 'merge()' sorts its result by the 'by' columns --
# confirm the resulting row order still matches the document order
# of the corpus before relying on the docvars.
docvars(UKLib.corpus) <- merge(docvars(UKLib.corpus),
manifesto.metadata,
by=c("party","date"))
str(docvars(UKLib.corpus))
# Finally we create a document-feature matrix, without punctuation, numbers,
# symbols and stopwords, with the remaining words stemmed:
UKLib.dfm <- dfm(UKLib.corpus,
remove_punct=TRUE,
remove_numbers=TRUE,
remove_symbols=TRUE,
remove=stopwords("english"),
stem=TRUE)
str(docvars(UKLib.dfm))
# A more fine-grained control is possible using 'tokens()':
# punctuation and numbers are already dropped at the tokenization stage
UKLib.toks <- tokens(UKLib.corpus,
remove_punct=TRUE,
remove_numbers=TRUE)
UKLib.toks
UKLib.dfm <- dfm(UKLib.toks)
# Stop words are removed from the document-feature matrix ...
UKLib.dfm <- dfm_remove(UKLib.dfm,
pattern=stopwords("english"))
# ... and the remaining features are stemmed
UKLib.dfm <- dfm_wordstem(UKLib.dfm,language="english")
# 'quanteda' provides support for dictionaries: here two hand-made
# word lists for military and economic vocabulary
milecondict <- dictionary(list(
Military=c("military","forces","war","defence","victory","victorious","glory"),
Economy=c("economy","growth","business","enterprise","market")
))
# Here we extract the frequency of tokens belonging to certain dictionaries:
# each dictionary category becomes a single feature of the matrix
UKLib.milecon.dfm <- dfm(UKLib.corpus,
dictionary=milecondict)
UKLib.milecon.dfm
# Election dates as date-times: the date code is 'YYYYMM', so the year
# is the integer quotient and the month the remainder of division by 100
time <- with(docvars(UKLib.milecon.dfm),
ISOdate(year=date%/%100,
month=date%%100,
day=1))
# Total number of tokens per manifesto, used to normalize the counts
UKLib.ntok <- ntoken(UKLib.corpus)
milit.freq <- as.vector(UKLib.milecon.dfm[,"Military"])
econ.freq <- as.vector(UKLib.milecon.dfm[,"Economy"])
# Relative frequencies: dictionary hits per token
milit.prop <- milit.freq/UKLib.ntok
econ.prop <- econ.freq/UKLib.ntok
# We plot the proportions of tokens over time, two panels stacked,
# each with a lowess smoother line.
# NOTE(review): 'lowess()' returns values sorted by x, so the 'lines()'
# calls assume the documents are already in chronological order --
# confirm this holds for the file ordering used above.
op <- par(mfrow=c(2,1),mar=c(3,4,0,0))
plot(time,milit.prop,type="p",ylab="Military")
lines(time,lowess(time,milit.prop)$y)
plot(time,econ.prop,type="p",ylab="Economy")
lines(time,lowess(time,econ.prop)$y)
par(op)