Commit f8a67228 authored by Vince Trost

Delete wikify_all_server5543.R

parent 9288f582
#!/usr/local/bin/Rscript
setwd("/mnt/")
#load R packages
library(stringi)
library(reticulate)
library(data.table)
library(doFuture)
#library(future.apply)
Sys.setenv(RETICULATE_PYTHON = '/usr/bin/python')
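#NOTE (added for clarity): RETICULATE_PYTHON has to be set before reticulate
#initializes its Python session, i.e. before the source_python() call below.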
#load functions
source_python("~/nlp-pipeline/api/findPhrase.py")
message("Reading page.csv in...")
page <- fread("page.csv")
message("Done.")
#set keys
setkey(page, Ind)
setindex(page, vocabword)
setindex(page, views)
setindex(page, is_rd)
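#NOTE (added for clarity; inferred from the code below, not documented in the
#original script): page.csv is expected to carry at least the columns
#Ind, vocabword, page_id, token, views, links, is_rd, namespace and dis_id.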
#Merge overlapping candidate spans and keep one "best" span per overlap group.
collapse_spans <- function(tok_span_tab){
  tab <- tok_span_tab
  tab$width <- tab$end - tab$start + 1
  setkey(tab, start, end)
  #group spans that overlap: a new group starts whenever the next start
  #exceeds the running end of the preceding spans
  chunks <- tab[, .(start = min(start), end = max(end)),
                by = .(group = cumsum(c(1, sort(tail(tab$start, -1))
                                        > sort(head(tab$end, -1)))))]
  setkey(tab, start, end)
  setkey(chunks, start, end)
  #map each span back to the merged chunk it falls inside
  ovlap <- foverlaps(tab, chunks, type = "any", which = TRUE)
  tab$group <- ovlap$yid
  setindex(tab, width, links, views, start, group)
  #drop spans with no incoming links, then keep the widest, most linked,
  #most viewed span within each overlap group
  tab <- tab[tab$links > 0, ]
  setorder(tab, -width, -links, -views)
  tab <- tab[!duplicated(tab$group), ]
  setorder(tab, start)
  tab <- tab[, group := NULL]
  tab <- tab[, width := NULL]
  data.table(tab)
}
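#Illustrative usage only (toy values, not part of the original pipeline):
#spans <- data.table(start = c(1, 1, 4), end = c(2, 1, 4),
#                    links = c(10, 3, 5), views = c(100, 40, 20),
#                    token = c("new_york", "new", "city"))
#collapse_spans(spans)
#keeps the two-token "new_york" span over the overlapping one-token "new",
#plus the non-overlapping "city" span.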
#Replace recognized phrases in a sentence with their wiki/<token> form.
Wikify <- function(text){
  text <- tolower(text)
  text <- iconv(text, from = "UTF-8", to = "ASCII", sub = "")
  #strip any existing "wiki/" prefixes
  text <- gsub("wiki/", "", text)
  #take out all special characters
  text <- gsub("[[:punct:]]", " ", text)
  #find candidate vocabulary phrases via the Python helper
  NPh <- find_phrases(text)$voc_tab
  NPh <- data.table(py_to_r(NPh))
  #Python offsets are 0-based; shift to R's 1-based indexing
  NPh$start <- NPh$start + 1
  out <- NULL
  if (dim(NPh)[1] > 0){
    #attach page metadata to each candidate phrase
    out <- merge(NPh, page[NPh$vocabword, on = "vocabword"],
                 all.x = TRUE, by = "vocabword", allow.cartesian = TRUE)
    out <- out[!is.na(page_id)]
    out <- setorder(out, namespace, is_rd, -views)
    out <- out[, .(start, end, page_id, vocabword, token, views, links, is_rd, dis_id)]
    out <- out[!duplicated(out[, .(start, end, vocabword)])]
    out <- setorder(out, start)
    if (dim(out)[1] > 0){
      out <- collapse_spans(out)
    }
  }
  #rebuild the sentence, swapping each span's first word for wiki/<token>
  wordDT <- stri_split_fixed(text, " ")
  words <- data.table(unlist(wordDT))
  words$sentence_ind <- seq_len(nrow(words))
  words$V1[out$start] <- paste0("wiki/", out$token)
  #for multi-word spans, drop the trailing word(s) the token now covers
  if (length(out$end[which(out$start != out$end)]) > 0) {
    words <- words[-out$end[which(out$start != out$end)]]
  }
  sentence <- paste(words$V1, collapse = " ")
  return(sentence)
}
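#Illustrative usage only (the exact output depends on page.csv and on
#findPhrase.py, so this result is an assumption):
#Wikify("The Empire State Building is in New York City.")
##> "the wiki/empire_state_building is in wiki/new_york_city"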
main <- function(){
  #there are 93716482 sentences in the 6 out files
  file <- "out.5543"
  #print message to user and read in data
  print(paste0("*** Now working on ", file, " ***"))
  #count how many sentences are left to do and pick a starting chunk size
  n_todo <- as.numeric(system(paste0("cat /mnt/", file, " | wc -l"),
                              intern = TRUE))
  #initialize other parameters
  resp_chain <- 1
  c_chunk_size <- 70000
  param_chain <- NULL
  while(n_todo > 0){
    #set up the parallel back end
    workers <- 32
    cores <- 32
    registerDoFuture()
    plan(multicore, workers = workers)
    options(future.globals.maxSize = 2500*1024^2)
    options(future.availableCores.system = cores)
    s_time <- Sys.time()
    #progress report
    #message(paste0("There are ", n_todo, " sentences to do..."))
    #read the file from the back: skip everything before the current chunk
    skip <- n_todo - c_chunk_size
    #towards the end, the chunk size might be greater than n_todo, making skip
    #negative; if that happens (likely), just set skip to 0
    if(skip < 0){
      skip <- 0
    }
message("Reading the chunk of the file in...")
chunk <- fread(paste0("/mnt/", file),
sep = "\n",
header = FALSE,
fill=TRUE,
skip = skip,
nrows = c_chunk_size)
colnames(chunk)[1] <- "sentence"
#head(chunk)
sentences <- NULL
message("Wikify-ing the chunk...")
#split up the wikification of the chunk
sentences <- foreach(i = 1:nrow(chunk), .combine = rbind, .errorhandling = "remove", .export = c("chunk", "page")) %dopar%{
Wikify(chunk[i])
}
    ### TEST ###
    #t1_time <- Sys.time()
    #Wikify(chunk$sentence[2])
    #t2_time <- Sys.time()
    #t_eTime <- difftime(t2_time, t1_time, units = 'secs')
    ############
    sentences <- unclass(sentences)
    sentences <- as.data.table(sentences)
    message("Printing the head of sentences...")
    print(head(sentences))
    message(paste0("Writing out part of the chunk..."))
    fwrite(sentences, "/mnt/wikified_sentences.csv", append = TRUE)
    #adjust n_todo
    n_todo <- n_todo - c_chunk_size
    #estimate time remaining from the last chunk's elapsed time
    c_time <- Sys.time()
    e_time <- difftime(c_time, s_time, units = "secs")
    more <- n_todo/c_chunk_size
    l_time <- round(more, 0)*e_time
    d_time <- s_time + l_time
    left <- difftime(d_time, s_time, units = "auto")
    #calculate efficiency stats
    c_rate <- unclass(e_time/c_chunk_size)[1]
    #accumulate the chunk sizes used so far so mean(param_chain) is meaningful
    param_chain <- c(param_chain, c_chunk_size)
    #report to user in message
    timeRow <- data.table(file = file, nworkers = workers, ntodo = n_todo,
                          chunkSize = c_chunk_size, timeElapsed = e_time,
                          secondsPerSentence = (e_time/c_chunk_size))
    fwrite(timeRow, paste0("time_stats", workers, ".csv"), append = TRUE)
    message(paste0(n_todo, " sentences remaining | ",
                   unclass(left)[1], " ",
                   data.frame(t(unlist(attributes(left))))$units[1],
                   " remaining", " | chunk = ", mean(param_chain)))
  }
}
main()
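#Invocation sketch (an assumption -- the original launch command is not part
#of this file):
#  Rscript wikify_all_server5543.R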