Scraping data from clicked links on page then moving to next page & repeating in CasperJS

I'm struggling to get a CasperJS script to move on to the next page after it has recursively worked through the links on the current page.

I can get it to take data from each page and move through the pages, or to click on each link on a page, but I can't get it to do both.

var utils = require('utils');
var x = require('casper').selectXPath;

// The CasperJS instance that every function below operates on.
var casper = require('casper').create({
  verbose: true,
  logLevel: 'error',
  waitTimeout: 10000,
  pageSettings: {
    // Skip image and plugin loading.
    loadImages: false,
    loadPlugins: false,
    // Desktop Chrome user agent; the run of stray spaces that had crept
    // into the middle of the string has been collapsed to a single space.
    userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
  }
});

// Script-wide state that the step callbacks below read and update.
var currentPage = 1;   // results page number, compared against the td.cur cell
var i = 0;             // index into `links` of the next link to open
var links = [];        // hrefs returned by getPageLinks
var link_titles = [];  // one { title: ... } record per opened link


/**
 * Prints a farewell message and then exits.
 * Must be called with `this` bound to an object that offers a chainable
 * echo() and an exit() method (the casper instance in this script).
 */
var terminate = function() {
    var afterEcho = this.echo("Exiting..");
    afterEcho.exit();
};

/**
 * Reads the number of the results page that is currently selected.
 * Looks up the `td.cur` cell and parses its text as a base-10 integer.
 * It relies on the global `document`, so it has to run where a DOM exists.
 *
 * @returns {number} the parsed page number, or NaN when there is no
 *     `td.cur` element or its text does not start with digits.
 */
function getSelectedPage() {
    var el = document.querySelector('td.cur');
    if (!el) {
        // No marker cell found: report "no page" rather than throwing.
        return NaN;
    }
    // Explicit radix so the text is never read as hex or octal.
    return parseInt(el.textContent, 10);
}

/**
 * Collects the href attribute of every `h3.r a` anchor in the document.
 * It relies on the global `document`, so it has to run where a DOM exists.
 *
 * @returns {Array} the href values, in document order.
 */
function getPageLinks () {
  var anchors = document.querySelectorAll('h3.r a');
  var hrefs = [];
  for (var idx = 0; idx < anchors.length; idx++) {
    hrefs.push(anchors[idx].getAttribute('href'));
  }
  return hrefs;
}


/**
 * Queues a step that opens `link` and records the title of the page that
 * was loaded. Call with `this` bound to the casper instance.
 * The record ({ title: ... }) is appended to the shared `link_titles`
 * array when the queued callback runs, not when this function returns.
 *
 * @param {string} link - URL to open.
 */
function getLinkData(link) {
  var recordTitle = function() {
    // Add the data from the opened link.
    link_titles.push({ title: this.getTitle() });
  };

  this.thenOpen(link, recordTitle);
}

/**
 * Handles the link at index `i`, then re-enters itself through run().
 * Call with `this` bound to the casper instance. Once `i` is no longer
 * below `links.length`, the collected `link_titles` are dumped instead.
 */
function loopThroughLinks() {
  if (!(i < links.length)) {
    utils.dump(link_titles);
    return;
  }

  var current = links[i];
  this.echo('[LINK #' + i + '] ' + current);
  getLinkData.call(this, current);
  i += 1;
  // Note: this issues another run() call from inside the running script.
  this.run(loopThroughLinks);
}


/**
 * Stores the result links of the current page in the shared `links`
 * array and starts the per-link loop through run().
 * Call with `this` bound to the casper instance.
 */
function linkData(){
    var found = this.evaluate(getPageLinks);
    links = found;
    this.run(loopThroughLinks);
}


/**
 * Handles one results page: starts the per-link scraping, then queues a
 * step that requests the next results page. The script terminates once
 * `currentPage` has reached 3.
 * Called with `this` bound to the casper instance (it is passed to
 * casper.run() and to waitFor() below).
 */
var processPage = function() {  

    // Start collecting and opening the links of the page that is loaded.
    // NOTE(review): this is a further run() call made from code that is
    // itself started by run(); the answer further down says these inner
    // run() calls have to become then().
    this.run(linkData);

    //PROBLEM EXISTS BELOW HERE - IF YOU COMMENT OUT FROM HERE IT RUNS AS EXPECTED FOR THE FIRST PAGE
    //WITH CODE BELOW INCLUDED, SKIPS this.run(linkData) AND JUST GOES THROUGH PAGES;
    this.then(function(){

    // Stop after the third results page.
    if (currentPage >= 3) {
        return terminate.call(casper);
    }

    // Incremented before the message, the screenshot name and the wait
    // check below, so all three refer to the page being requested.
    currentPage++;

    this.echo("requesting next page: " + currentPage);
    // Captured before the click, so the image still shows the page being left.
    this.capture("google-results-p" + currentPage + ".png");

    // Click the next-page control, then poll until the td.cur cell reports
    // the expected page number. processPage is passed as the success
    // callback and terminate as the timeout callback.
    this.thenClick('a.pn span').then(function(){
        this.waitFor(function(){
            return currentPage === this.evaluate(getSelectedPage);

        }, processPage, terminate);
     }); 
  });   //COMMENT OUT TO HERE FOR WORKING ONE PAGE VERSION
}


// Entry point: open the search results page and pass processPage to run().
casper.start('https://www.google.co.uk/?gws_rd=ssl#q=casperjs').run(processPage);

Updated code to reflect the advice about multiple run() calls. It now loops through the first page correctly, but for every later page it prints the results from the first page again.

var utils = require('utils');
var x = require('casper').selectXPath;

// Script-wide state that the step callbacks below read and update.
var link_titles = [];  // one { title: ... } record per opened link
var links = [];        // hrefs returned by getPageLinks
var i = 0;             // index into `links` of the next link to open
var currentPage = 1;   // results page number, compared against the td.cur cell

// The CasperJS instance that every function below operates on.
var casper = require('casper').create({
  verbose: true,
  logLevel: 'error',
  waitTimeout: 10000,
  pageSettings: {
    // Skip image and plugin loading; present a desktop Chrome user agent.
    loadImages: false,
    loadPlugins: false,
    userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
  }
});


/**
 * Announces that the script is exiting and then exits.
 * Expects `this` to be an object with a chainable echo() and an exit()
 * method (the casper instance in this script).
 */
var terminate = function() {
    var self = this.echo("Exiting..");
    self.exit();
};

/**
 * Reads the number of the results page that is currently selected.
 * Looks up the `td.cur` cell and parses its text as a base-10 integer.
 * It relies on the global `document`, so it has to run where a DOM exists.
 *
 * @returns {number} the parsed page number, or NaN when there is no
 *     `td.cur` element or its text does not start with digits.
 */
function getSelectedPage() {
    var el = document.querySelector('td.cur');
    if (!el) {
        // No marker cell found: report "no page" rather than throwing.
        return NaN;
    }
    // Explicit radix so the text is never read as hex or octal.
    return parseInt(el.textContent, 10);
}

/**
 * Collects the target URL of every `h3.r a` anchor in the document.
 * It relies on the global `document`, so it has to run where a DOM exists.
 *
 * @returns {Array} one entry per anchor, in document order: the captured
 *     redirect target when the href matches the redirect pattern,
 *     otherwise the raw href attribute value.
 */
function getPageLinks() {
    var anchors = document.querySelectorAll("h3.r a");
    return Array.prototype.map.call(anchors, function(anchor) {
        var href = anchor.getAttribute("href");
        // Google routes result hrefs through a redirect script of theirs;
        // pull the real target out of the q= parameter when it is there.
        // An explicit match check replaces the former try/catch, which
        // used an exception as control flow and hid any other error.
        var redirect = (/url\?q=(.*)&sa=U/).exec(href);
        return redirect ? redirect[1] : href;
    });
}

/**
 * Queues a step that opens `link`, records the title of the loaded page
 * and then navigates back. Call with `this` bound to the casper instance.
 * The record ({ title: ... }) is appended to the shared `link_titles`
 * array when the queued callback runs, not when this function returns.
 *
 * @param {string} link - URL to open.
 */
function getLinkData(link) {
  var recordTitleThenGoBack = function() {
    // Add the data from the opened link.
    link_titles.push({ title: this.getTitle() });

    // Return to the previous page so the selectors match again.
    this.then(function() {
      this.back();
    });
  };

  this.thenOpen(link, recordTitleThenGoBack);
}

/**
 * Handles the link at index `i` and queues itself again as the next step.
 * Call with `this` bound to the casper instance. Once `i` is no longer
 * below `links.length`, the collected `link_titles` are dumped instead.
 */
function loopThroughLinks() {
  var hasMore = i < links.length;
  if (!hasMore) {
    utils.dump(link_titles);
    return;
  }

  var target = links[i];
  this.echo('[LINK #' + i + '] ' + target);
  getLinkData.call(this, target);
  i++;
  this.then(loopThroughLinks);
}


/**
 * Collects the result links of the page that is currently loaded and
 * queues the loop that opens them one by one.
 * Call with `this` bound to the casper instance.
 */
function linkData(){
    // Fall back to an empty list when evaluate() hands back nothing, so
    // the loop's `links.length` check cannot throw.
    links = this.evaluate(getPageLinks) || [];
    // Restart the cursor for every results page. Without this, `i` still
    // holds the previous page's link count, loopThroughLinks finds nothing
    // left to open, and only the titles gathered so far are dumped again.
    i = 0;
    this.then(loopThroughLinks);
}


/**
 * Handles one results page: after a 2000 ms wait it queues the per-link
 * scraping, and after a second 2000 ms wait it queues the step that
 * requests the next results page. The script terminates once
 * `currentPage` has reached 3.
 * Called with `this` bound to the casper instance (it is queued through
 * casper.then() and passed to waitFor() below).
 */
var processPage = function() {  

    // First wait: then queue link collection for the loaded page.
    this.wait(2000, function(){
    this.then(linkData);
    });


    // Second wait: then queue the pagination step.
    this.wait(2000, function(){
    this.then(function(){

    // Stop after the third results page.
    if (currentPage >= 3) {
        return terminate.call(casper);
    }


    // Message and screenshot run before the increment below, so both
    // carry the number of the page that has just been handled.
    this.echo("requesting next page: " + currentPage);
    this.capture("google-results-p" + currentPage + ".png");


    currentPage++;


    // Click the next-page control, take a second screenshot (named with
    // the incremented number), then poll until the td.cur cell reports the
    // expected page. processPage is passed as the success callback and
    // terminate as the timeout callback.
    this.thenClick('a.pn span').then(function(){
        this.capture('google-results-2-p' + currentPage + '.png');
        this.waitFor(function(){
            return currentPage === this.evaluate(getSelectedPage);

        }, processPage, terminate);
     }); 
  });
 });
}


// Entry point: open the search results page, queue processPage as a step,
// then start the step queue with a single run() call.
casper
  .start('https://www.google.co.uk/?gws_rd=ssl#q=casperjs')
  .then(processPage)
  .run();

Answers


You have to have only one casper.run() (and only one casper.start()) call. run() starts the CasperJS step queue and will finish execution if there are no further steps. The only call that needs to stay is casper.run(processPage);, but all other this.run(...) calls need to be changed to this.then(...).


Need Your Help

Dimensionality reduction in HOG feature vector

matlab machine-learning computer-vision pca

I found out the HOG feature vector of the following image in MATLAB.

How to apportion between BatchInserterIndex cache and MMIO?

memory configuration lucene neo4j memory-mapped-files

In a batch insertion using lucene indexes, given a large set of nodes and relations such that the node and relationship store cannot fit completely in mapped memory (hence the need for lucene index