updated server wikify script

b2d2a699 · Ubuntu · ddaac2cf · b2d2a699
Commit b2d2a699 authored 6 years ago by Ubuntu
--- a/wikify_all_server.R
+++ b/wikify_all_server.R
 #!/usr/local/bin Rscript

-setwd("/mnt2/")
+setwd("/mnt/")
 #load R packages
 library(stringi)
 library(reticulate)
@@ -13,7 +13,9 @@ Sys.setenv(RETICULATE_PYTHON = '/usr/bin/python')

 #load functions
 source_python("~/nlp-pipeline/api/findPhrase.py")
+message("Reading page.csv in...")
 page<-fread("page.csv")
+message("Done.")

 collapse_spans<- function(tok_span_tab){
    tab<-tok_span_tab
@@ -71,7 +73,7 @@ Wikify<-function(text){

 main <- function(){
  #there are 93716482 sentences in the 6 out files
-  flist <- list.files("/mnt2/")
+  flist <- list.files("/mnt/wikiClean/")


  #REMEMBER TO REMOVE WHEN PACKAGING UP
@@ -83,7 +85,7 @@ main <- function(){
    print(paste0("*** Now working on ", file, " ***"))

    #pick a chunk size  to start and batch size
-    n_todo<-as.numeric(system(paste0("cat /mnt2/", file, " | wc -l"),
+    n_todo<-as.numeric(system(paste0("cat /mnt/wikiClean/", file, " | wc -l"),
                              intern = TRUE))

    #initialize other parameters
@@ -113,7 +115,7 @@ main <- function(){
        skip <- 0
      }
      message("Reading the chunk of the file in...")
-      chunk <- fread(paste0("/mnt2/", file),
+      chunk <- fread(paste0("/mnt/wikiClean/", file),
                    sep = "\n",
                    header = FALSE,
                    fill=TRUE,
@@ -132,7 +134,7 @@ main <- function(){
      message("Printing the head of sentences...")
      head(sentences)
      message(paste0("Writing out part of the chunk..."))
-      fwrite(sentences, "/mnt2/wikified_sentences.csv", append=TRUE)
+      fwrite(sentences, "/mnt/wikified_sentences.csv", append=TRUE)

      #adjust n_todo
      n_todo <- n_todo - c_chunk_size
@@ -149,8 +151,8 @@ main <- function(){
      param_chain<-c_chunk_size

      #report to user in message
-      timeRow  <- data.table(file = file, nworkers =  workers, ntodo = n_todo, chunkSize = c_chunk_size, epoch = counter, timeElapsed = e_time)
-      fwrite(timeRow, "time_stats.csv", append=TRUE)
+      timeRow  <- data.table(file = file, nworkers =  workers, ntodo = n_todo, chunkSize = c_chunk_size, epoch = counter, timeElapsed = e_time, efficiency = (e_time/workers))
+      fwrite(timeRow, paste0("time_stats", workers, ".csv"), append=TRUE)
      message(paste0(n_todo, " sentences remaining  |  ",
                     unclass(left)[1]," ",
                     data.frame(t(unlist(attributes(left))))$units[1],