Error in writing data frame in R

2019-03-04 11:15发布

问题:

I'm trying to search for a word in the text that I extract from an OCR'd PDF file. The PDF has multiple pages, so I search each page for the word. If the word is found, I want to write the file name, the status (Present or Not Present), the page(s) on which it was found, and the words that were found to a data frame. However, the data frame shows the status "Present" for all files. I want output like this:

file_name       Status        Page              words
test1.pdf    "Present"       test1_2,test1_4    gym,school
test2.pdf    "Not Present"     -                 -
test3.pdf    "Present"       test3_1            gym

What am I missing in this code?

Here is the code:

    # Script from the question: for every PDF in the working directory, render
    # each page to a TIFF, OCR it, and look for `word` (and the words in
    # `chk_words`); one result row per file is appended to `df`.
    All_files=Sys.glob("*.pdf")
# v1[i] holds the page number where `word` was first found in file i (0 = none).
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
# tc, ps and x are created once here and never reset inside the loops, so
# their contents carry over from one page/file to the next.
tc=c()
ps=c()
x=list()
df <- data.frame()
# This initial value is overwritten for every file after the page loop.
Status="Present"

for (i in seq_along(All_files)){


  file_name <- All_files[i]

  # Number of pages in the current PDF.
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)

  for(j in seq_len(cnt)){
    # Render page j to an image file and run OCR on it.
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    # Split the OCR text into lines by capturing what cat() prints.
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    # NOTE(review): dir() without full.names = TRUE returns bare file names, so
    # file.remove() resolves them against the current working directory, not
    # against the hard-coded folder listed here.
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    # Case-insensitive literal search for `word` in the OCR'd lines.
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"  
    else "Present" 
    print(br)       
    # On the first page that contains `word`, remember the page and leave the
    # page loop. NOTE(review): because of this break, the chk_words loop below
    # never runs for the matching page, and later pages are not scanned, so
    # only one page per file can ever be reported.
    if(br=="Present") {
      v1[i] <- j
      break}

    # Check every word in chk_words against the current page.
    for(k in chk_words){ 
      # print() returns its argument, so br receives the printed string.
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){ print("Not Present") } else {print("Present")}
      # NOTE(review): without braces the if guards only `ps=k`; the next two
      # statements run on every iteration, so a `ps` left over from an earlier
      # word is stored under x[[k]] even when k itself was not found.
      if(br == "Present")
        ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }




  }

  print(tc)
  # Derive the per-file result from v1[i]: 0 means `word` was not found.
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else 
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  # NOTE(review): tc is passed to cbind() as an extra column. cbind() drops it
  # while it is NULL and recycles the other values into several rows once it
  # has more than one element, so the shape of the bound rows can change
  # between files.
  df <- rbind(df, cbind(file_name = basename(file_name),
                        Status, pages = pages, words = words,tc))


}

Any suggestions would be appreciated.

Thanks

回答1:

Here is an option for a single word:

# Search each OCR'd PDF in `All_files` for a single word and build `df` with
# one row per file: file name, status, first matching page, and the word.
# Expects `All_files` (character vector of PDF paths) to be defined already and
# pdftools, tesseract (`ocr`) and stringi (`stri_detect_fixed`) to be attached.

v1 <- numeric(length(All_files))  # first page with a match per file; 0 = none
word <- "school"
rows <- vector("list", length(All_files))  # preallocated: one row per file

for (i in seq_along(All_files)) {
  file_name <- All_files[i]

  cnt <- pdf_info(file_name)$pages
  print(cnt)

  for (j in seq_len(cnt)) {
    img_file <- pdftools::pdf_convert(
      file_name,
      format = "tiff", pages = j, dpi = 400
    )
    text <- ocr(img_file)
    # pdf_convert() returns the path of the image it wrote, so delete exactly
    # that file rather than listing a directory through an undefined `path`
    # (dir() would also return bare names that file.remove() cannot resolve).
    file.remove(img_file)

    # Case-insensitive literal match. isTRUE() treats an NA OCR result as
    # "no match" instead of making if() fail.
    found <- isTRUE(any(stri_detect_fixed(tolower(text), tolower(word))))
    br <- if (found) "Present" else "Not Present"
    print(br)
    if (found) {
      v1[i] <- j
      break  # stop at the first page that contains the word
    }
  }

  # v1[i] is still 0 when no page of this file contained the word.
  Status <- if (v1[i] == 0) "Not Present" else "Present"
  pages <- if (v1[i] == 0) {
    "-"
  } else {
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  }
  words <- if (v1[i] == 0) "-" else word

  rows[[i]] <- data.frame(
    file_name = basename(file_name),
    Status = Status,
    pages = pages,
    words = words,
    stringsAsFactors = FALSE
  )
}

# Bind all rows once instead of growing `df` with rbind() inside the loop.
df <- if (length(rows) > 0) {
  do.call(rbind, rows)
} else {
  data.frame(
    file_name = character(0),
    Status = character(0),
    pages = character(0),
    words = character(0),
    stringsAsFactors = FALSE
  )
}

-output

df
#     file_name      Status  pages  words
#1 Amenities.pdf Not Present      -      -
#2      test.pdf     Present test_2 school