Count word frequencies in list-of-lists-of-words

2019-04-07 23:35发布

问题:

I have a large text corpus stored in a data frame:

res (dataframe)

text.1

1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <NA>
2  beren stuart vanuatu  monday  october       venkatesh ramesh sandeep talanki     nagaraj subject  approve  qlikview gpa access   process   form  gpa access  email  requestor  line manager   access  granted raj    add    user  qlikview workgroup    gpa access form  requestors  lim tek kon vanuatu address lini high  port vila efate title  relationship manager emerging corporates employee id  lan id limtk bsbcc  authorising manager beren stuart vanuatu     read    gpa dashboard business technical reason na  
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           text.2
1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <NA>
2   kumar  santhosh   behalf  relationshipbankingfinancesupport  friday  october       venkatesh cc global business reporting subject fw  approve  qlikview gpa access  santhosh   faunt daniel png  wednesday  october     relationshipbankingfinancesupport cc amet sova subject fw  approve  qlikview gpa access    unable  approve  excel due  macro issues  process   amet sova  monday  october     faunt daniel png subject  approve  qlikview gpa access     review  attached form  click line manager approval  approve 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          text.3
1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           <NA>
2   thomson owen tonga  thursday  october       venkatesh ramesh sandeep talanki     nagaraj subject  approve  qlikview gpa access   process   form  gpa access  email  requestor  line manager   access  granted raj    add    user  qlikview workgroup    gpa access form  requestors  hia viliami address head office fakafanua centre maufanga vuna road nukualofa tongatapu tonga nukualofa tongatapu title  nfc  amu manager employee id  lan id hiav bsbcc   authorising manager thomson owen tonga     read    gpa dashboard business technical reason  

1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             <NA>
2  kumar rajesh fiji  tuesday  october       venkatesh ramesh sandeep talanki     nagaraj subject  approve  qlikview gpa access   process   form  gpa access  email  requestor  line manager   access  granted raj    add    user  qlikview workgroup    gpa access form  requestors  fong vincent address level  anz house victoria parade suva suva viti levu title  national manager commercial banking fiji employee id  lan id fongv bsbcc  authorising manager kumar rajesh fiji     read    gpa dashboard business technical reason  user  
                                                                                                                                                                               text.5
1                                                                                                                                                                                <NA>
2  dennis david timor  thursday  october     buchanan geoffrey solomon islands subject  approve  qlikview gpa access     review  attached form  click line manager approval  approve 
                                                                                              text.6
1                                                                                               <NA>
2  matthey christopher  wednesday  october   pm  parrott louise subject document  file documentzip  
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    text.7
1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
2   tan jasmine  thursday  october   pm  global business reporting cc tan yong hoong rai dinkar subject  rtc view report  sep  sensitivity confidential  team  don’  access   sharepoint link    arrange   access   jasmine  ayyamperumal rajendran ramesh kumar  behalf  global business reporting  tuesday  october   pm  kumar gaurav hong kong  tan jasmine seah linda shroff manish behan thibault hong kong  clay iv william cc tan yong hoong rai dinkar tan matthew rb finance  sim sui poh subramanian raghuveer   murugeshaiah sunil subject rtc view report  sep  sensitivity confidential  october  dear    attached   sharepoint  report   rtc portfolio     client list   august report    discussed   individual reviews      rtc financials     full client financials  pivot table   excel file    metrics    clients  note   report  based   rtc client list   dinkar  queries    client list         review   list    december reporting excel file   worksheets rtc summary default income measure product details pivot table  product   measures  rtc data  detail client level data grouping  rtc  rtc methodology explained          queries  email global business reporting     issues accessing reports   sharepoint sharepoint link ø gaurav kumar ø jasmine tan ø linda seah ø manish shroff ø thibault behan ø william clay   global business reporting team
                                                                                                                                                                                                                                           text.8
1                                                                                                                                                                                                                                            <NA>
2  deo ravinesh  friday  october       venkatesh global business reporting cc monteleone elif kabyanga isaac pinto rufus kiribati  kumar  santhosh  subject   approve  qlikview gpa access  team     assist rufus ceo kiribati   gpa access  ravi
                                                                                                                                           text.9
1                                                                                                                                            <NA>
2  epoa regina  thursday  october     relationshipbankingfinancesupport subject gpa analysis    filled  form        reports  assist cheers regina
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                text.10
1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  <NA>
2 original message  tseng rickson  thursday  october   pm  global business reporting cc kumar  santhosh   wong toto subject fw gpa importance high  santhosh  venkatesh   quickly grant  global iib gpa access  mary macleod cheers rickson original message  wong toto  thursday  october   pm  tseng rickson kumar  santhosh  subject  gpa    installed qlik  marys desktop    access   account   ready  toto original message  tseng rickson  wednesday  october     kumar  santhosh  cc wong toto subject  gpa  santhosh     email gbr mailbox   requesting marys iib cfo access  gpa     helping   setup cheers rickson original message  kumar  santhosh   wednesday  october     tseng rickson cc wong toto subject  gpa  rickson  continue  email global business report mailbox venkatesh    cover  work   find  replacement  sandeep  software package  windows   package apcqliktechintabqvpluginsetupr  santhosh original message  tseng rickson  tuesday  october   pm  kumar  santhosh  cc wong toto subject  gpa  santhosh        sandeep  left  bank  dont  whats  software package    win  gpa plugin   dont    grant access  mary cheers rickson original message  wong toto  tuesday  october   pm  tseng rickson subject fw gpa  rickson   advise  software package    upgrade marys desktop  win  week    add  package    ready  toto original message  yip vivian  tuesday  october   pm  wong toto subject fw gpa  toto     gpa installed  mary macleods desktop computerbefore friday  october rickson       computer   lan id    window version  order    installation  advise    vivian yip  executive assistant  mr gilles planté  deputy ceo iib  anz   exchange square  connaught place central hong kong phone    original message  broker ali  tuesday  october   pm  yip vivian tseng rickson li shirley cc macleod mary scott nicola subject gpa  vivian    gpa   installed  marys laptop    installed    rickson  spend  minutes  mary           week  mary   hk  ali
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                        text.11
1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
                                                                                                           <NA>
2   ang vanessa  tuesday  october     global business reporting subject  discontinued  commercial fum performance report  monday  october   team  advise    reach   moving forward   information required   vanessa  ayyamperumal rajendran ramesh kumar  behalf  global business reporting  tuesday  october     au yeung ivy bhuta chintan chang frank chok christopher chuang jacky china  dyer andy goyal aseem gupta vivek jiang charles li shirley lim jasmine ec  loh jonathan mcleod donnelle miller greg singapore  praswitrianto rama roumier frederic hong kong  leoni kelly hong kong  runciman gary hong kong  shankar vijay soh serene tong nelson tran dang cecilia tseng rickson yeh anita yeung jonathan hong kong  tse ying tin yew lolita ang benedict hong kong  lea danay lin gloria tong mike chuang jacky china  chen carrie china  poon yen chi anita qian jack chow frankie jiang helen china  oum morokot dith sochal kheng sopheakchenda wong theodore foo chang horng bhattacharya arnab truong kent hong kong  chan vincent cy hong kong  skien craig hong kong  lau vincent yeung jonathan hong kong  sum selina chok christopher yau emily lee irene hong kong  chung margaret lam betty turel kaiwan chan david hong kong  chak katherine cheng wilson hong kong  chiu polly dhupar karan chow ruskin hong kong  wong sunny minam saud fiji  damayanti meirina eka bahashwan rifai venkatesh shailesh sucianto lucy kartadinata paul tye alan ng wee lee diana ang sarup adesh lim jasmine ec  yeoh  hin ler adrain ang vanessa vu pham linh phuong tran thi sinh vietnam  bui thanh van nadarajah lavanya vietnam  lee john   chu sally chou peter huang sophia   tw tb  lin lydia  chang richard hsu ken huang michelle chow winnie tw tb cc mathad vijayakumar kumar  santhosh   subramanian raghuveer   mohan  durga subject discontinued  commercial fum performance report  monday  october  monday  october    commercial fum performance report      forward due  change   business structure   back    friday  oct      global 
business reporting  anz support services india manyata embassy business park bangalore  email global business reporting

From this data frame I have extracted the words I need:

# Words to extract. Plain alternation is sufficient: a one-character class
# such as [a] matches exactly the same text as the literal character, and
# str_extract_all() returns the whole match, so no capture group is needed.
# Note: the pattern also matches inside longer words (e.g. "reporting").
pattern <- "access|report|data"

# For every column of `res`, collect all occurrences of the target words;
# str_extract_all() yields one character vector of matches per input string.
O <- lapply(res, function(x) str_extract_all(x, pattern))

result EDIT

   $text
$text[[1]]
[1] "access" "access" "access" "access"

$text[[2]]
[1] "report" "access" "access" "access"

$text[[3]]
[1] "access" "access" "access" "access"

$text[[4]]
[1] "access" "access" "access" "access"

$text[[5]]
[1] "report" "access" "access" "access" "access" "access" "access" "access"

$text[[6]]
 [1] "report" "access" "access" "report" "access" "access" "access" "access" "access" "access"

$text[[7]]
 [1] "report" "report" "access" "access" "report" "report" "report" "report" "report" "report" "data"   "data"   "report"
[14] "access" "report" "report"

$text[[8]]
[1] "report" "access" "access"

$text[[9]]
[1] "report" "access" "access" "access" "report"

$text[[10]]
[1] "report" "access" "access" "access" "report" "access"

In this result I want to count the occurrences of each word.
I tried to use str_count for this, but it was not helpful. I also found many word-count questions on Stack Overflow, but none in R for this kind of list.

# Attempted count (does not produce per-word frequencies): str_count() is
# called here without a pattern argument, so no target word is specified.
# The `c <-` assignment only creates a local variable named `c` inside the
# anonymous function; the value of the assignment is what lapply() collects.
dd<-lapply(O,function(x) c<-str_count(x))

Alternatively, can I calculate the frequency of each word in each list? I tried termFrequency, but it is not supported on my R version (3.1.0).

 # Sample input: a single "text" entry holding, for each of ten documents,
 # the character vector of words extracted from it (possibly empty).
 O <- list(
   text = list(
     rep("access", 4),
     c("report", "access", "access", "access"),
     rep("access", 4),
     rep("access", 4),
     "access",
     character(0),
     c(
       "report", "report", "access", "access",
       "report", "report", "report", "report", "report", "report",
       "data", "data", "report", "access", "report", "report"
     ),
     c("report", "access", "access"),
     "report",
     c("report", "access", "access", "access", "report", "access")
   )
 )

I referred to this Stack Overflow question and tried frq1 <- findFreqTerms(O), but it did not work.

回答1:

OK, tell me whether this works for you.

Using this data:

# Sample data: each text.x entry is a two-element list; the second element
# holds the words extracted from that text (the first element is empty).
O <- list(
  text.1 = list(character(0), rep("access", 4)),
  text.2 = list(character(0), c("report", "access", "access", "access")),
  text.3 = list(character(0), rep("access", 4)),
  text.4 = list(character(0), rep("access", 4)),
  text.5 = list(character(0), "access"),
  text.6 = list(character(0), character(0)),
  text.7 = list(
    character(0),
    c(
      "report", "report", "access", "access",
      "report", "report", "report", "report", "report", "report",
      "data", "data", "report", "access", "report", "report"
    )
  ),
  text.8 = list(character(0), c("report", "access", "access")),
  text.9 = list(character(0), "report"),
  text.10 = list(
    NULL,
    c("report", "access", "access", "access", "report", "access")
  )
)

Since the words always seem to be in the second element of each text.x list, we'll take those words and put them in a new list (newlist). In addition, we'll turn them into factors with a fixed set of levels, so that the counts for every text can be combined into a single table later on.

# Words to tally. Using them as fixed factor levels guarantees that every
# text yields a count for each word (0 when absent), always in this order.
word_levels <- c("access", "data", "report")

# The extracted words are the second element of each text.x list in O.
# lapply() builds the whole list at once instead of growing it in a loop;
# unname() keeps newlist unnamed, as in the loop-based version.
newlist <- unname(lapply(O, function(item) {
  factor(item[[2]], levels = word_levels)
}))

# Tabulate each text. vapply() returns one column of counts per text, so the
# result is transposed to get one row per text and one column per word.
# Sizes are derived from the data, so any number of texts is supported.
dd <- t(vapply(
  newlist,
  function(words) as.integer(table(words)),
  integer(length(word_levels))
))

rownames(dd) <- paste0("Text.", seq_along(newlist))
colnames(dd) <- word_levels

dd

#         access data report
# Text.1       4    0      0
# Text.2       3    0      1
# Text.3       4    0      0
# Text.4       4    0      0
# Text.5       1    0      0
# Text.6       0    0      0
# Text.7       3    2     11
# Text.8       2    0      1
# Text.9       0    0      1
# Text.10      4    0      2