PhantomJS: al raspar un feed, parte del contenido AJAX no se carga, incluso con un setTimeout generoso

Tengo una configuración genérica de PhantomJS para raspar feeds (lo hacemos con el permiso del propietario, para un cliente): le das una URL, un fragmento de código jQuery/JavaScript para pasar de página y un selector para extraer los enlaces del feed.

Este feed parece estar cargando todo bien, excepto los botones para pasar la página [ver imágenes].

Renderizado por PhantomJS:

Foto de PhantomJS con botones de 'cambio de página' faltantes

Renderizado por Chrome en mi computadora:

[Imagen: la misma página renderizada por Chrome]

He estado perplejo por más de un día.

Cualquier ayuda será muy apreciada.

Mi código:

// Headless page shared by every function in this script. The settings and
// the viewport size are handed to the WebPage constructor in one object.
var page = new WebPage({
    settings: {
        loadPlugins: true,
        userAgent: "Mozilla/5.0 (X11; Linux i686) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5",
        XSSAuditingEnabled: false,
        webSecurityEnabled: false
    },
    viewportSize: { width: 1366, height: 768 }
});

// Result envelope: `errors` collects console output that is not a command,
// `results` receives the value sent back by the page. It is printed as JSON
// by the onConsoleMessage handler when a 'return_value' command arrives.
var output_phantom = { errors: [], results: null };

// jQuery snippet evaluated inside the page to move to the next result page.
var turn_page_jquery = "$('#yui-pg0-0-next-link').click_link()";

// jQuery snippet evaluated inside the page to collect the link URLs.
var url_selector_jquery = "$('h3 a').multiAttr('href')";

// Feed to scrape.
// NOTE(review): the URL ends in %22 (an encoded double quote) - confirm it is intended.
var url = "http://www.springcjd.com/new/search?dpt=2#QryString=%3FlogSearch%3Dfalse%26sortCol%3Dnull%26sortType%3Dnull%26VIN%3Dnull%26dealerStockID%3Dnull%26stockType%3D2%26year%3Dnull%26make%3Dnull%26model%3Dnull%26subModel%3Dnull%26body%3Dnull%26minMileage%3Dnull%26maxMileage%3Dnull%26numOfDoors%3Dnull%26certified%3Dnull%26minDFList%3Dnull%26maxDFList%3Dnull%26onLotAfter%3Dnull%26onLotBefore%3Dnull%26pageNum%3D1%26carsPerPage%3D30%26fullText%3Dnull%26%26mpghmn%3Dnull%26%26lotID%3Dnull%26daysOnLotMin%3Dnull%26%26output%3Djson%22";

// Start things going here: scrape the configured feed using the link selector
// and the page-turn snippet declared above.
// NOTE(review): this call runs before page.onError / page.onConsoleMessage are
// assigned further down. It works because function declarations are hoisted;
// presumably the page.open callback only fires after the rest of the script
// has executed - confirm.
get_links_from_pages(url, url_selector_jquery, turn_page_jquery);

/**
 * Runs `func` inside the page, forwarding any additional arguments to it.
 *
 * The function source and the JSON-encoded arguments are spliced into the
 * text of a wrapper function, and that text is handed to page.evaluate().
 * Because only the source text travels, `func` cannot rely on variables
 * from this script's scope, and the extra arguments must be JSON-serialisable.
 *
 * @param {Object} page   Object exposing evaluate(); receives the wrapper source.
 * @param {Function} func Function whose toString() output is embedded in the wrapper.
 * @param {...*} args     Values passed to `func` via apply().
 * @returns {*} Whatever page.evaluate() returns.
 */
function evaluate(page, func) {
    var forwarded = Array.prototype.slice.call(arguments, 2);
    var func_source = func.toString();
    var args_json = JSON.stringify(forwarded);

    return page.evaluate(
        "function() { return (" + func_source + ").apply(this, " + args_json + ");}"
    );
}

// Logs JavaScript errors raised inside the page, followed by one line per
// stack frame (file and line number).
//   msg   - the error message
//   trace - array of frames, each read here for its `file` and `line`
page.onError = function (msg, trace) {
    console.log(msg);

    // Guard against a missing or empty trace so that the error handler
    // itself cannot throw while reporting an error.
    if (!trace || !trace.length) {
        return;
    }

    trace.forEach(function(item) {
        console.log('  ', item.file, ':', item.line);
    });
};

// Communicator between phantomJS and the page.
//
// Every console message from the page arrives here as a string. Messages that
// are JSON envelopes produced by send_console_command() (they carry a `type`
// and the 'phantom_js_communicator' validation stamp) are treated as commands:
//   'return_value' - store the payload in output_phantom.results, print the
//                    whole envelope as JSON and exit
//   'message'      - append the payload to output_phantom.errors
//   'exit'         - exit immediately
//   'render'       - save a screenshot; payload is the file name
// Anything else (plain text, foreign JSON, unknown command types) is kept
// verbatim in output_phantom.errors.
page.onConsoleMessage = function(msg) 
{
    var msg_json;

    // Parse defensively: ordinary console output from the page is not JSON,
    // and this handler must not depend on JSON.parse having been patched to
    // swallow errors.
    try {
        msg_json = JSON.parse(msg);
    } catch (err) {
        msg_json = undefined;
    }

    // Only envelopes stamped by send_console_command() are commands; this keeps
    // the page's own JSON logging from being mistaken for 'exit' or 'render'.
    if (!msg_json || !msg_json.type || msg_json.validation !== 'phantom_js_communicator')
    {
        output_phantom.errors.push(msg);
        return;
    }

    var message = msg_json.message;

    switch (msg_json.type)
    {
        case 'return_value':
            output_phantom.results = message;
            console.log(JSON.stringify(output_phantom));
            phantom.exit();
            break;
        case 'message':
            output_phantom.errors.push(message);
            break;
        case 'exit':
            phantom.exit();
            break;
        case 'render':
            // Fall back to the default file name when none was supplied.
            var photo_name = 'phantom_test.png';
            if (typeof message === 'string' && message !== '')
            {
                photo_name = message;
            }
            page.render(photo_name);
            break;
        default:
            // Unknown command: record it instead of dropping it silently.
            output_phantom.errors.push(msg);
            break;
    }
};

/**
 * Injects jQuery (when the page has none) and the helper script that lets the
 * page talk back to this script through onConsoleMessage.
 *
 * - If the page already defines jQuery, nothing is injected for it.
 * - If jQuery is missing but `$` is already a function, jQuery is NOT injected
 *   and a notice is logged.
 * - If an injection fails, an error is logged, phantom.exit() is requested
 *   and the function returns immediately.
 *
 * Would like to be able to overwrite an older version of jQuery.
 */
function inject_scripts()
{
    var jquery_type = page.evaluate(function () { return typeof jQuery; });

    if (jquery_type === 'undefined')
    {
        var dollar_type = page.evaluate(function () { return typeof $; });

        if (dollar_type === 'function') {
            console.log("'$' symbol already used.");
        }
        else if (!page.injectJs("/../../jquery.js")) {
            console.log("jQuery not loaded...");
            phantom.exit();
            // Stop here: without this return the function carried on and tried
            // to inject the helper script even though exit was already requested.
            return;
        }
    }

    // Inject scripts that allow communication using fn onConsoleMessage
    if (!page.injectJs("/../lf_additional.js")) {
        console.log("Additional scripts not loaded...");
        phantom.exit();
    }
}

/**
 * Opens `url` and, once the open callback fires, injects the helper scripts,
 * resets output_phantom and runs `fn` inside the page.
 *
 * A load status other than 'success' no longer passes unnoticed: it is
 * recorded in output_phantom.errors. The run still continues, as before.
 *
 * @param {string} url   Address to open.
 * @param {Function} fn  Function executed inside the page through evaluate().
 * @param {...*} args    Extra values forwarded to `fn` (must survive JSON).
 */
function run_phantom(url, fn/* args here, just not seen */)
{
    var extra_args = [].slice.call(arguments, 2);

    page.open(url, function (status) {
        inject_scripts();
        output_phantom = {errors: [], results: null};

        if (status !== 'success') {
            output_phantom.errors.push('Unable to load the url! (URL: ' + url + ', status: ' + status + ')');
        }

        // run our js code inside the headless browser.
        // Build a fresh argument list each time: mutating extra_args in place
        // would prepend `page, fn` again if this callback ever fired twice.
        evaluate.apply(this, [page, fn].concat(extra_args));
    });
}

/**
 * Scrapes several consecutive result pages of a feed.
 *
 * Inside the page, a timer fires every `interval_ms` milliseconds. On each
 * tick it requests a screenshot ('ph_scjd_<n>.png'), evaluates
 * `url_selector_jquery` and stores the result, then evaluates
 * `turn_page_jquery` to advance. After `max_pages` pages the timer is stopped
 * and the collected results are sent back with phantom_return(). The page is
 * not turned again after the last collection.
 *
 * @param {string} url                  Feed address to open.
 * @param {string} url_selector_jquery  JavaScript source evaluated to collect one page of data.
 * @param {string} turn_page_jquery     JavaScript source evaluated to move to the next page.
 * @param {number} [max_pages=4]        Number of pages to collect (must be >= 1).
 * @param {number} [interval_ms=3000]   Delay between ticks in milliseconds (must be >= 0).
 * @returns {*} Whatever run_phantom() returns.
 */
function get_links_from_pages(url, url_selector_jquery, turn_page_jquery, max_pages, interval_ms)
{
    var page_count = (typeof max_pages === 'number' && max_pages >= 1) ? max_pages : 4;
    var tick_delay = (typeof interval_ms === 'number' && interval_ms >= 0) ? interval_ms : 3000;

    // This function is shipped to the page as source text, so it must only use
    // its own parameters and names that exist inside the page.
    var fn = function(url_selector_jquery, turn_page_jquery, page_count, tick_delay)
             {
                var results = [];
                var i = 0;

                var interval = setInterval(function()
                    {
                        // Photograph page right before selecting values
                        phantom_render('ph_scjd_'+i+'.png');

                        // NOTE: eval() runs caller-supplied source inside the
                        // page; only pass snippets you trust.
                        results.push(eval(url_selector_jquery));
                        i++;

                        if (i >= page_count) {
                            // Stop the timer before reporting so no further
                            // tick can run once the results are sent.
                            clearInterval(interval);
                            phantom_return(results);
                            return;
                        }

                        // Try to turn the page
                        eval(turn_page_jquery);

                    }, tick_delay);
             };
    return run_phantom(url, fn, url_selector_jquery, turn_page_jquery, page_count, tick_delay);
}

//////////////////////////////////////////////////////////
// Other stuff

// Replaces the global JSON object with a forgiving wrapper:
//   JSON.stringify(value) - the native stringify, unchanged
//   JSON.validate(str)    - true when str parses, otherwise the caught error
//                           object (note: that error object is truthy)
//   JSON.parse(str)       - the parsed value, or undefined when str is not
//                           valid JSON (never throws on bad input)
(function() {

    var native_parse     = JSON.parse;
    var native_stringify = JSON.stringify;

    function validate(str) {
        try {
            native_parse(str);
        } catch(err) {
            return err;
        }
        return true;
    }

    function parse(str) {
        var value;
        try {
            value = native_parse(str);
        } catch(err) {
            value = undefined;
        }
        return value;
    }

    JSON = {
        stringify: native_stringify,
        validate: validate,
        parse: parse
    };
})();

El archivo inyectado, 'additional.js':

/* 
 *  These are additional scripts intended to make selection of multiple elements
 *  much more concise.
 */

/**
 * Dispatches a synthetic mouse "click" event on every element in the set.
 *
 * The event is sent to the matched elements themselves rather than
 * re-querying the document with this.selector: that property is deprecated
 * in jQuery and is empty for sets that were not built from a selector string.
 *
 * @returns {jQuery} The same set, for chaining.
 */
$.fn.click_link = function() {
    return this.each(function() {
        var evt = document.createEvent('MouseEvents');
        evt.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
        this.dispatchEvent(evt);
    });
};

/**
 * Builds an array with one value per element in the set.
 *
 * @param {Function|string} fn  Either a function called with `this` bound to
 *                              the jQuery-wrapped element, or an attribute
 *                              name that is read with .attr().
 * @returns {Array} The collected values, in set order.
 */
$.fn.collect = function(fn) {
    var getter = fn;

    // An attribute name is shorthand for "read that attribute".
    if (typeof fn === 'string') {
        getter = function() { return this.attr(fn); };
    }

    var matched = $(this);
    var collected = [];
    for (var idx = 0; idx < matched.length; idx++) {
        collected.push(getter.call($(matched[idx])));
    }
    return collected;
};

/**
 * Reads the attribute `attrName` from every element in the set.
 *
 * @param {string} attrName  Attribute to read.
 * @returns {Array} One attribute value per element, as produced by collect().
 */
$.fn.multiAttr = function(attrName) {
    var attr_values = this.collect(attrName);
    return attr_values;
};

/**
 * Collects the .html() of every element in the set into an array.
 * (.text() should be pretty close, except concatenated?)
 *
 * @returns {Array} One HTML string per element.
 */
$.fn.multiHtml = function() {
    return this.collect(function() {
        return this.html();
    });
};

/**
 * Collects the 'value' attribute of every element in the set.
 * NOTE(review): this reads the attribute through multiAttr('value'), not
 * .val() - confirm that the attribute is what callers expect.
 *
 * @returns {Array} One value per element.
 */
$.fn.multiVal = function() {
    var values = this.multiAttr('value');
    return values;
};

// jQuery.compareArray(arrayA, arrayB): true when both arrays hold the same
// elements regardless of order, false otherwise.
// A shorter but probably less efficient alternative would be:
// $(arr1).not(arr2).length == 0 && $(arr2).not(arr1).length == 0
jQuery.extend({
    compareArray: function (arrayA, arrayB) {
        // Different sizes can never match.
        if (arrayA.length != arrayB.length) {
            return false;
        }

        // sort() works in place and the arguments belong to the caller,
        // so sort deep copies instead of the originals.
        var sortedA = jQuery.extend(true, [], arrayA);
        var sortedB = jQuery.extend(true, [], arrayB);
        sortedA.sort();
        sortedB.sort();

        // Any position that differs (strict comparison) means no match.
        var idx = sortedA.length;
        while (idx--) {
            if (sortedA[idx] !== sortedB[idx]) {
                return false;
            }
        }
        return true;
    }
});

/**
 * Dispatches one synthetic mouse "click" event to every element matching
 * `selector` (looked up with document.querySelectorAll).
 *
 * @param {string} selector  Selector passed to document.querySelectorAll.
 */
function simulateMouseClick(selector) {
    var matches = document.querySelectorAll(selector);

    // A single event object is created up front and reused for every target.
    var click_event = document.createEvent('MouseEvents');
    click_event.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);

    Array.prototype.forEach.call(matches, function (target) {
        target.dispatchEvent(click_event);
    });
}

/**
 * Sends a command from the page to the PhantomJS script by writing a JSON
 * envelope to the console: { message, type, validation }.
 *
 * @param {string} type  Command name (e.g. 'exit', 'message', 'return_value', 'render').
 * @param {*} [message]  Payload; defaults to '' only when omitted or null, so
 *                       legitimate falsy payloads such as 0 or false are sent
 *                       unchanged.
 */
function send_console_command(type, message)
{
    var msg = {
        message:    message == null ? '' : message,
        type:       type,
        // Stamp that marks this console line as one of our commands.
        validation: 'phantom_js_communicator'
    };

    console.log(JSON.stringify(msg));
}

/** Asks the PhantomJS script to exit (sends the 'exit' command, no payload). */
function phantom_exit() {
    var command = 'exit';
    send_console_command(command);
}

/**
 * Sends `msg` to the PhantomJS script as a 'message' command.
 * @param {*} msg  Payload to pass along.
 */
function phantom_message(msg) {
    var command = 'message';
    send_console_command(command, msg);
}

/**
 * Sends `return_val` to the PhantomJS script as a 'return_value' command.
 * @param {*} return_val  Value to hand back.
 */
function phantom_return(return_val) {
    var command = 'return_value';
    send_console_command(command, return_val);
}

/**
 * Sends a 'render' command carrying the screenshot file name.
 * @param {string} photo_name  File name for the screenshot.
 */
function phantom_render(photo_name) {
    var command = 'render';
    send_console_command(command, photo_name);
}

Preguntado el 12 de junio de 2012 a las 17:06

1 respuesta

¿No debería ser así?

// Suggested alternative: create the page through the 'webpage' module and
// assign the viewport size afterwards instead of passing it to the constructor.
var page = require('webpage').create();
page.viewportSize = { width: 1366, height: 768 };

No lo he probado; solo he leído la documentación.

Respondido el 13 de junio de 2012 a las 15:06

Gracias por su respuesta. Lo probé, y parece que ambas formas establecerán el tamaño de la ventana gráfica de la misma manera. - Nathan Lippi

¿No es la respuesta que estás buscando? Examina otras preguntas etiquetadas o haz tu propia pregunta.