Using casperjs and phantomjs to scrape multiple pages

2019-04-17 07:34发布

I'm trying to scrape a number of pages that have a standard format. I've been able to use Phantomjs to successfully scrape a single page, but when I try to iterate over multiple ones, the asynchronous processing makes things hang up. What's the proper way to tell Casper/Phantom to wait?


var page = require('webpage').create();
var fs = require('fs');

// Select UTF-8 for terminal output once, up front, instead of re-assigning
// the setting every time the page logs a message. This way it is already in
// effect for the script's own console.log calls as well.
phantom.outputEncoding = "utf-8";

/**
 * Relays console output produced inside the loaded web page to the
 * PhantomJS console.
 *
 * @param {string} msg - Text the page passed to its console.
 */
page.onConsoleMessage = function(msg) {
    console.log(msg);
};


// This overwrites the previous output file: it is opened in "w" mode and the
// "--" marker is written as its first content.
// `f` is declared with var rather than being created as an implicit global.
var f = fs.open("lat_long.txt", "w");
try {
    f.write("--");
} finally {
    // Release the handle even if the write throws.
    f.close();
}


   // Unique identifiers for the locations; each one is meant to be substituted
   // into the query URL below. For now there are just three data points.
  var EPAID = ["KYD980501076","ME8170022018", "MEN000103584"]; 

 /// This loop will be used to walk through the different locations. For now the
 /// bound is 1, so only a single iteration runs.
 /// NOTE: q is assigned without var, so it is not a declared local.
 for (q= 0;  q < 1; q++)  {
    // Flag set to true after a record has been written (see below). Only the
    // commented-out wait loop near the end of the loop body would read it.
    var processing = false;



   // Construct the target url.
   // NOTE: the ID is always EPAID[0]; the loop index q is never used here.
   var url  = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[0]  + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;


   page.open(url);
   // The load handler is assigned after page.open(url) has been called, and it
   // is re-assigned on every loop iteration because the single `page` object
   // is shared.
   page.onLoadFinished = function(status) {
   // Anything other than "success" is ignored without any message.
   if ( status === "success" ) {
       // Load jQuery into the page; the callback does the actual scraping.
       page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
           // The function passed to evaluate builds one text record from the
           // result table and returns it as a string.
           // NOTE: $value, $Object, $string and i are all assigned without var.
           var str = page.evaluate(function() {                   
               $value = [];
               $Object = $(".result tr");                
               // For 10 steps, push two cell contents per step (the first
               // matched td, and the td two siblings after it), then advance
               // the row set to the next siblings.
               for (i =0 ; i < 10; i++) { 
             $value.push($Object.find('td').html(),$Object.find('td').next().next().html() );          
             $Object = $Object.next();
            } 

            // Pick the collected cells by fixed position. Keys and values are
            // concatenated without quotes, so the result is not valid JSON.
            $string = "{ EPAID: "+  $value[0] +  ", " + 
                     "Name: "+  $value[1] +  ", " +                
                     "City: "+  $value[4] +  ", " +
                     "State: "+  $value[6] +  ", " +
                     "ZipCode: "+  $value[8] +  ", " +  
                     "Latitude: "+  $value[14] +  ", " +
                     "Longitude: "+  $value[16] +  " }" ;          
            return $string;
        });

        // Append the record to the output file.
        // NOTE: f is assigned without var here.
        f = fs.open("lat_long.txt", "a");
        f.write(str);
        f.close();
        processing = true;
        console.log("writing to file");
       // Exits PhantomJS as soon as the first record has been written, so no
       // further page could be handled after this point.
       phantom.exit();    

    });
 }


 // right here it should delay until the previous page is completed        
 // NOTE(review): the disabled loop below is synchronous; it would presumably
 // never let the load callback run and set `processing` - confirm.
 //  while (!processing)  {    
 //       setTimeout(function(){ console.log("waiting....");},1000);
 //    }


};

}

// Runs as soon as the loop above has finished. The question describes the page
// processing as asynchronous, so this presumably prints before any record has
// been written - confirm.
console.log("finished all pages");

2条回答
看我几分像从前
2楼-- · 2019-04-17 08:24

Here is the code that ultimately works (using the timeout approach since I wasn't able to get the success callback to work better).

With casperjs installed, I named this file "process.js" and was able to run it from the command line as "casperjs process.js"


var page = require('webpage').create();
var fs = require('fs');

// Select UTF-8 for terminal output once, up front, instead of re-assigning
// the setting every time the page logs a message. This way it is already in
// effect for the script's own console.log calls as well.
phantom.outputEncoding = "utf-8";

/**
 * Relays console output produced inside the loaded web page to the
 * PhantomJS console.
 *
 * @param {string} msg - Text the page passed to its console.
 */
page.onConsoleMessage = function(msg) {
    console.log(msg);
};


// (The fs.open(..., "w") call further down overwrites the previous output file.)
// These are the unique identifiers for the locations; each one is substituted
// into the query URL. The original post elides most of the list, so the
// elision is kept as a comment to leave the array literal syntactically valid.
var EPAID = ["NED981713837", /* ... remaining IDs elided ... */ "FLD049985302", "NJD986643153"];


// Start the run with a fresh output file: it is opened in "w" mode and the
// "-<>-" marker is written as its first content.
// `f` is declared with var rather than being created as an implicit global.
var f = fs.open("lat_long.txt", "w");
try {
    f.write("-<>-");
} finally {
    // Release the handle even if the write throws.
    f.close();
}


// Index of the next EPAID entry to request.
var count = 0;
// Upper bound on how many records are requested in one run.
var target = 1400;
// written[i] is set to true once record i has been appended to the output
// file, so that the same record is not written twice.
var written = [];

/**
 * Timer-driven scheduler: starts the scrape of the next record, then re-arms
 * itself to run again 5 seconds later. Once every record has been started
 * (and the final 5-second wait has elapsed) it exits PhantomJS.
 *
 * Reads and updates the file-level `count`; reads `target` and `EPAID`.
 */
function yourFunction(){
    // Stop at whichever comes first: the configured cap or the end of the ID
    // list. Without the second bound, a `target` larger than the list would
    // keep requesting URLs built from EPAID[count] === undefined.
    var limit = Math.min(target, EPAID.length);

    if (count >= limit) {
        console.log("exiting");
        phantom.exit();
        return;
    }

    process(count);
    count++;
    // Fixed delay between records; the next request is started on a timer,
    // not when the previous page reports that it has finished loading.
    setTimeout(yourFunction, 5000);
}




/**
 * Builds the text appended to the output file for one scraped record:
 *
 *   { "EPAID": "...", "Name": "...", "City": "...", "State": "...",
 *     "ZipCode": "...", "Latitude": <raw>, "Longitude": <raw> },
 *
 * (emitted on a single line, with a trailing comma).
 *
 * @param {Array} cells - Cell contents collected from the result table, in
 *     the order they were pushed; positions 0, 1, 4, 6, 8, 14 and 16 are used.
 * @returns {string} The formatted record.
 */
function formatRecord(cells) {
    // String(...) first, so a missing cell (null) is still written as the
    // text "null" just as plain concatenation would produce. JSON.stringify
    // then adds the surrounding quotes and escapes embedded quotes,
    // backslashes and control characters, which would otherwise corrupt the
    // record.
    function quoted(value) {
        return JSON.stringify(String(value));
    }

    // Latitude and Longitude are inserted as-is (unquoted), so they must
    // already be numeric text for the record to be valid.
    return "{ \"EPAID\": " + quoted(cells[0]) + ", " +
           "\"Name\": " + quoted(cells[1]) + ", " +
           "\"City\": " + quoted(cells[4]) + ", " +
           "\"State\": " + quoted(cells[6]) + ", " +
           "\"ZipCode\": " + quoted(cells[8]) + ", " +
           "\"Latitude\": " + cells[14] + ", " +
           "\"Longitude\": " + cells[16] + " },";
}

/**
 * Requests the result page for EPAID[counter], scrapes the result table with
 * jQuery and appends one formatted record to "lat_long.txt".
 *
 * Uses the file-level `page`, `fs`, `EPAID` and `written`. Each record is
 * written at most once, even if the load handler fires more than once.
 *
 * @param {number} counter - Index into EPAID of the record to scrape.
 */
function process(counter){

    console.log("Beginning record #" + counter);

    //we construct the target url
    var url  = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[counter]  + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;

    // Install the handler before starting the load so it is guaranteed to be
    // in place when the load completes.
    page.onLoadFinished = function(status) {
        if (status !== "success") {
            // Report the failure instead of dropping the record silently.
            console.log("Failed to load record #" + counter + " (status: " + status + ")");
            return;
        }

        // This record is already on disk; skip the repeated scrape.
        if (written[counter] !== undefined) {
            return;
        }

        page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
            // Collect the raw cell contents inside the page. Locals are
            // declared with var so nothing leaks into the page's globals.
            var cells = page.evaluate(function() {
                var values = [];
                var rows = $(".result tr");
                // For 10 steps, push two cell contents per step (the first
                // matched td, and the td two siblings after it), then advance
                // the row set to the next siblings.
                for (var i = 0; i < 10; i++) {
                    values.push(rows.find('td').html(), rows.find('td').next().next().html());
                    rows = rows.next();
                }
                return values;
            });

            if (!cells) {
                console.log("No data extracted for record #" + counter);
                return;
            }

            if (written[counter] === undefined) {
                var f = fs.open("lat_long.txt", "a");
                try {
                    f.write(formatRecord(cells));
                } finally {
                    // Release the handle even if the write throws.
                    f.close();
                }
                written[counter] = true;
                console.log("Writing to file #"+  counter);
            }
        });
    };

    page.open(url);
}

 // Announce the run, then start the first record. yourFunction re-arms its
 // own timer, so later records are started from that timer.
 console.log("Start...");

yourFunction();
查看更多
Bombasti
3楼-- · 2019-04-17 08:29

If you switched to using casperJS, it is as simple as changing your page.open() into page.thenOpen(). (This CasperJS - How to open up all links in an array of links question looks very similar to yours?)

If you wanted to stick with PhantomJS you need to start the next page load in the onSuccess callback of the previous load. This is tedious, and needs care to avoid large memory usage. (I did it once or twice, but now simply use CasperJS.)

An alternative approach is to create the page object inside the loop. However that is not quite answering your question, as then they will run in parallel. But you could use setTimeout to stagger each one to avoid a burst of activity if you have hundreds of URLs!

查看更多
登录 后发表回答