Parse multiple pages with phantomjs -


I have written code that parses the URLs from a page. Next, I need to take the href of every parsed URL found in <div class="holder"><a href="these url-s"></a></div> and output them to a file, separated by commas.

So far I have written the code below. It is able to find the URLs that need to be parsed and collects them, comma-separated, in a file called output2.txt.

// PhantomJS script: opens `url`, waits until network activity settles (or a
// hard deadline passes), then writes the href of every <a> found inside
// `.urldiv` elements to output2.txt as a single comma-separated line.
//
// PhantomJS API names are case-sensitive (viewportSize, includeJs,
// onResourceRequested, onResourceReceived), as are setTimeout/clearTimeout.

var fs = require('fs');
var page = require('webpage').create();

var resourceWait = 300;     // ms to wait after the last pending resource finishes
var maxRenderWait = 10000;  // ms after a successful open before rendering is forced
var url = 'url parse href-s from';
var outputPath = 'output2.txt';

var count = 0;              // number of resource requests still in flight
var rendered = false;       // one-shot guard so doRender only runs once
var forcedRenderTimeout;
var renderTimeout;

page.viewportSize = { width: 1280, height: 1024 };

/**
 * Injects jQuery into the page, collects the hrefs of all anchors under
 * `.urldiv`, writes them comma-separated to `outputPath`, and exits PhantomJS.
 * Safe to call more than once; only the first call has any effect.
 */
function doRender() {
    if (rendered) {
        return;
    }
    rendered = true;

    // Both timers can lead here; cancel whichever has not fired yet. The
    // jQuery request issued below also passes through the resource handlers
    // and would otherwise be able to schedule a second render.
    clearTimeout(renderTimeout);
    clearTimeout(forcedRenderTimeout);

    page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js', function () {
        var hrefs = page.evaluate(function () {
            return $('.urldiv')
                .find('a')
                .map(function () {
                    return this.href;
                })
                .get()
                .join(',');
        });

        // NOTE(review): evaluate presumably yields null if the in-page code
        // fails (e.g. jQuery did not load); write an empty file in that case.
        fs.write(outputPath, hrefs || '', 'w');
        phantom.exit();
    });
}

// Track every outgoing request; any new request postpones the idle render.
page.onResourceRequested = function (req) {
    count += 1;
    clearTimeout(renderTimeout);
};

// A resource is finished when its final chunk arrives (stage 'end') or when
// no stage is reported at all. Once nothing is pending, start the idle timer.
page.onResourceReceived = function (res) {
    if (!res.stage || res.stage === 'end') {
        count -= 1;

        if (count === 0) {
            renderTimeout = setTimeout(doRender, resourceWait);
        }
    }
};

page.open(url, function (status) {
    if (status !== 'success') {
        console.log('Unable to load ' + url);
        phantom.exit(1);
        return;
    }

    // Fallback: render after maxRenderWait even if requests never settle.
    forcedRenderTimeout = setTimeout(function () {
        console.log(count);
        doRender();
    }, maxRenderWait);
});

thanks in advance,

martti


Comments

Popular posts from this blog

java - Date formats difference between yyyy-MM-dd'T'HH:mm:ss and yyyy-MM-dd'T'HH:mm:ssXXX -

c# - Get rid of xmlns attribute when adding node to existing xml -