PhantomJS Node - page.open - cannot keep track of

2020-04-21 08:01发布

问题:

I'm using Phantom Node to interface node with PhantomJS. I'm trying to open pages in parallel, but the issue is that page.open callback function does not pass back the reference to the page, so I don't have a way to know which page has completed.

Relevant Code

self.queue[j].page.open.call( self.queue[j].page, rows[i].url, function( status )
{
   console.log( this ) // <-- returns undefined
   // So how do I keep track of which pages have finished loading?
   // The only variable I have available here is `status`
});

Full Function Code:

SnapEngine.prototype.processSnaps = function( rows, type )
{
var self = this;

if ( ! rows || rows.length === 0 ) return true;

for( var i = 0; i < rows.length; i++ )
{
    // If queue is full, stop processing and wait for next snap engine iteration
    if ( self.getAvailableSizeInQueue() <= 0 )
    {
        self.logger.info( 'Queue is full for signature snap processing' );
        return true;
    }

    // Snapshots are processed by url, if multiple duplication urls are requested, all are updated after one of them is complete
    // So if a url is already being processed, don't reprocess it
    if ( self.findUrlInQueue( rows[i].url ) !== false )
    {
        self.logger.info( 'URL already being processed', url );
        continue;
    }

    for( j = 0; j < self.queue.length; j++ )
    {
        // Find an unused page object
        if ( self.queue[j] && self.queue[j].hasOwnProperty( 'page' ) && ( ! self.queue[j].page.url || self.queue[j].page.url == '' ))
        {
            self.logger.info( 'Opening URL in browser', rows[i].url );

            // Start loading page
            self.queue[j].page.open.call( self.queue[j].page, rows[i].url, function( status )
            {
                // ===== ISSUE HERE =====
                var url = this.url; // <-- this is undefined
                // ======================

                self.resetPage( self.queue[ index ]);

                if ( status === 'success' )
                {
                    self.updateStatus( url, 'ready' );
                }
                else
                {
                    self.updateStatus( url, 'failed' );
                }

                self.removeUrlFromQueue( url )
            });

            self.updateStatus( rows[i].url, 'processing' );
            break;
        }
    }
}
}

回答1:

Try it like this:

I added a function that is directly executed around the part that opens the page, thus introducing a new scope. Therefore url won't get mangled (you cannot use rows[i].url as i will change before your callback is called) and will be available in your callback.

for( j = 0; j < self.queue.length; j++ )
{
    // Find an unused page object
    if ( self.queue[j] && self.queue[j].hasOwnProperty( 'page' ) && ( ! self.queue[j].page.url || self.queue[j].page.url == '' ))
    {
        self.logger.info( 'Opening URL in browser', rows[i].url );
        (function() {
            var url = rows[i].url;
            // Start loading page
            self.queue[j].page.open.call( self.queue[j].page, url, function( status )
            {                   
                self.resetPage( self.queue[ index ]);

                if ( status === 'success' )
                {
                    self.updateStatus( url, 'ready' );
                }
                else
                {
                    self.updateStatus( url, 'failed' );
                }

                self.removeUrlFromQueue( url )
            });
        })();

        self.updateStatus( rows[i].url, 'processing' );
        break;
    }
}