Commit f8a67228 authored by Vince Trost

Delete wikify_all_server5543.R

parent 9288f582
#!/usr/local/bin/Rscript
setwd("/mnt/")
#load R packages
library(stringi)
library(reticulate)
library(data.table)
library(doFuture)
#library(future.apply)
Sys.setenv(RETICULATE_PYTHON = '/usr/bin/python')
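#NOTE (added for clarity): RETICULATE_PYTHON has to be set before reticulate
#initializes its Python session, i.e. before the source_python() call below.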
#load functions
source_python("~/nlp-pipeline/api/findPhrase.py")
message("Reading page.csv in...")
page <- fread("page.csv")
message("Done.")
#set keys
setkey(page, Ind)
setindex(page, vocabword)
setindex(page, views)
setindex(page, is_rd)
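#NOTE (added for clarity; inferred from the code below, not documented in the
#original script): page.csv is expected to carry at least the columns
#Ind, vocabword, page_id, token, views, links, is_rd, namespace and dis_id.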
#Merge overlapping candidate spans and keep one "best" span per overlap group.
collapse_spans <- function(tok_span_tab){
  tab <- tok_span_tab
  tab$width <- tab$end - tab$start + 1
  setkey(tab, start, end)
  #group spans that overlap: a new group starts whenever the next start
  #exceeds the running end of the preceding spans
  chunks <- tab[, .(start = min(start), end = max(end)),
                by = .(group = cumsum(c(1, sort(tail(tab$start, -1))
                                        > sort(head(tab$end, -1)))))]
  setkey(tab, start, end)
  setkey(chunks, start, end)
  #map each span back to the merged chunk it falls inside
  ovlap <- foverlaps(tab, chunks, type = "any", which = TRUE)
  tab$group <- ovlap$yid
  setindex(tab, width, links, views, start, group)
  #drop spans with no incoming links, then keep the widest, most linked,
  #most viewed span within each overlap group
  tab <- tab[tab$links > 0, ]
  setorder(tab, -width, -links, -views)
  tab <- tab[!duplicated(tab$group), ]
  setorder(tab, start)
  tab <- tab[, group := NULL]
  tab <- tab[, width := NULL]
  data.table(tab)
}
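#Illustrative usage only (toy values, not part of the original pipeline):
#spans <- data.table(start = c(1, 1, 4), end = c(2, 1, 4),
#                    links = c(10, 3, 5), views = c(100, 40, 20),
#                    token = c("new_york", "new", "city"))
#collapse_spans(spans)
#keeps the two-token "new_york" span over the overlapping one-token "new",
#plus the non-overlapping "city" span.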
#Replace recognized phrases in a sentence with their wiki/<token> form.
Wikify <- function(text){
  text <- tolower(text)
  text <- iconv(text, from = "UTF-8", to = "ASCII", sub = "")
  #strip any existing "wiki/" prefixes
  text <- gsub("wiki/", "", text)
  #take out all special characters
  text <- gsub("[[:punct:]]", " ", text)
  #find candidate vocabulary phrases via the Python helper
  NPh <- find_phrases(text)$voc_tab
  NPh <- data.table(py_to_r(NPh))
  #Python offsets are 0-based; shift to R's 1-based indexing
  NPh$start <- NPh$start + 1
  out <- NULL
  if (dim(NPh)[1] > 0){
    #attach page metadata to each candidate phrase
    out <- merge(NPh, page[NPh$vocabword, on = "vocabword"],
                 all.x = TRUE, by = "vocabword", allow.cartesian = TRUE)
    out <- out[!is.na(page_id)]
    out <- setorder(out, namespace, is_rd, -views)
    out <- out[, .(start, end, page_id, vocabword, token, views, links, is_rd, dis_id)]
    out <- out[!duplicated(out[, .(start, end, vocabword)])]
    out <- setorder(out, start)
    if (dim(out)[1] > 0){
      out <- collapse_spans(out)
    }
  }
  #rebuild the sentence, swapping each span's first word for wiki/<token>
  wordDT <- stri_split_fixed(text, " ")
  words <- data.table(unlist(wordDT))
  words$sentence_ind <- seq_len(nrow(words))
  words$V1[out$start] <- paste0("wiki/", out$token)
  #for multi-word spans, drop the trailing word(s) the token now covers
  if (length(out$end[which(out$start != out$end)]) > 0) {
    words <- words[-out$end[which(out$start != out$end)]]
  }
  sentence <- paste(words$V1, collapse = " ")
  return(sentence)
}
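#Illustrative usage only (the exact output depends on page.csv and on
#findPhrase.py, so this result is an assumption):
#Wikify("The Empire State Building is in New York City.")
##> "the wiki/empire_state_building is in wiki/new_york_city"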
main <- function(){
  #there are 93716482 sentences in the 6 out files
  file <- "out.5543"
  #print message to user and read in data
  print(paste0("*** Now working on ", file, " ***"))
  #count how many sentences are left to do and pick a starting chunk size
  n_todo <- as.numeric(system(paste0("cat /mnt/", file, " | wc -l"),
                              intern = TRUE))
  #initialize other parameters
  resp_chain <- 1
  c_chunk_size <- 70000
  param_chain <- NULL
  while(n_todo > 0){
    #set up the parallel back end
    workers <- 32
    cores <- 32
    registerDoFuture()
    plan(multicore, workers = workers)
    options(future.globals.maxSize = 2500*1024^2)
    options(future.availableCores.system = cores)
    s_time <- Sys.time()
    #progress report
    #message(paste0("There are ", n_todo, " sentences to do..."))
    #read the file from the back: skip everything before the current chunk
    skip <- n_todo - c_chunk_size
    #towards the end, the chunk size might be greater than n_todo, making skip
    #negative; if that happens (likely), just set skip to 0
    if(skip < 0){
      skip <- 0
    }
message("Reading the chunk of the file in...")
chunk <- fread(paste0("/mnt/", file),
sep = "\n",
header = FALSE,
fill=TRUE,
skip = skip,
nrows = c_chunk_size)
colnames(chunk)[1] <- "sentence"
#head(chunk)
sentences <- NULL
message("Wikify-ing the chunk...")
#split up the wikification of the chunk
sentences <- foreach(i = 1:nrow(chunk), .combine = rbind, .errorhandling = "remove", .export = c("chunk", "page")) %dopar%{
Wikify(chunk[i])
}
    ### TEST ###
    #t1_time <- Sys.time()
    #Wikify(chunk$sentence[2])
    #t2_time <- Sys.time()
    #t_eTime <- difftime(t2_time, t1_time, units = 'secs')
    ############
    sentences <- unclass(sentences)
    sentences <- as.data.table(sentences)
    message("Printing the head of sentences...")
    print(head(sentences))
    message(paste0("Writing out part of the chunk..."))
    fwrite(sentences, "/mnt/wikified_sentences.csv", append = TRUE)
    #adjust n_todo
    n_todo <- n_todo - c_chunk_size
    #estimate time remaining from the last chunk's elapsed time
    c_time <- Sys.time()
    e_time <- difftime(c_time, s_time, units = "secs")
    more <- n_todo/c_chunk_size
    l_time <- round(more, 0)*e_time
    d_time <- s_time + l_time
    left <- difftime(d_time, s_time, units = "auto")
    #calculate efficiency stats
    c_rate <- unclass(e_time/c_chunk_size)[1]
    #accumulate the chunk sizes used so far so mean(param_chain) is meaningful
    param_chain <- c(param_chain, c_chunk_size)
    #report to user in message
    timeRow <- data.table(file = file, nworkers = workers, ntodo = n_todo,
                          chunkSize = c_chunk_size, timeElapsed = e_time,
                          secondsPerSentence = (e_time/c_chunk_size))
    fwrite(timeRow, paste0("time_stats", workers, ".csv"), append = TRUE)
    message(paste0(n_todo, " sentences remaining | ",
                   unclass(left)[1], " ",
                   data.frame(t(unlist(attributes(left))))$units[1],
                   " remaining", " | chunk = ", mean(param_chain)))
  }
}
main()
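#Invocation sketch (an assumption -- the original launch command is not part
#of this file):
#  Rscript wikify_all_server5543.R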