const Crawler = require('crawler');
const $ = require('cheerio');
const debug = require('debug')('amazonScraper:crawler');
/**
 * Builds the video detail page URI that the scraper requests for an ASIN.
 *
 * The ASIN is percent-encoded before being placed in the path so that
 * characters such as `/`, `?` or `#` cannot alter the requested URL.
 * Plain alphanumeric ASINs are left unchanged by the encoding.
 *
 * @param {String} asin - Identifier of the video to look up.
 * @returns {String} Absolute URI of the detail page for that ASIN.
 */
const URI = (asin) => `https://www.amazon.com/gp/video/detail/${encodeURIComponent(asin)}`;
/**
 * Scraper class.
 *
 * Provides scrapMovieById, which requests the detail page for an ASIN and
 * resolves with the data extracted from it, and parseMovieData, which does
 * the extraction from an already loaded page.
 */
class Scraper {
  /**
   * Requests the detail page for the given ASIN and parses it.
   *
   * @param {String} asin - Identifier passed to URI() to build the request.
   * @returns {Promise<Object>} Resolves with the object returned by
   *   parseMovieData, with `program.asin` set to the requested ASIN.
   *   Rejects with the crawler error when the request fails, or with the
   *   thrown error when the response cannot be parsed.
   */
  scrapMovieById(asin) {
    return new Promise((resolve, reject) => {
      const crawler = new Crawler({
        rateLimit: 500,
        retries: 3,
      });
      crawler.direct({
        uri: URI(asin),
        callback: (error, res) => {
          if (error) {
            // Stop here: on failure there is no usable response to parse.
            reject(error);
            return;
          }
          try {
            const movie = this.parseMovieData(res.$);
            movie.program.asin = asin;
            debug('Movie dentro do Crawler:scrapById: %O', movie);
            resolve(movie);
          } catch (parseError) {
            // An exception thrown inside the crawler callback would never
            // settle the promise, so surface it as a rejection instead.
            reject(parseError);
          }
        },
      });
    });
  }

  /**
   * Extracts the movie fields from a loaded detail page.
   *
   * @param {Function} $ - Selector function for the loaded page (the `$`
   *   exposed on the crawler response).
   * @returns {Object} An object of the form `{ program: { description,
   *   releaseYear, genres, title, images, keywords, cast, duration } }`.
   *   `genres`, `images` and `cast` are plain arrays of trimmed strings;
   *   `duration` is the leading integer of the third badge's text and is
   *   NaN when that text does not start with a number.
   */
  parseMovieData($) {
    const description = $('div[data-automation-id="synopsis"]').text();
    const releaseYear = $('span[data-automation-id="release-year-badge"]').text();
    // .map() yields a wrapped set; .get() unwraps it into a plain array.
    const genres = $('dt[data-automation-id="meta-info-genres"]')
      .next()
      .children('a')
      .map((_index, link) => $(link).text().trim())
      .get();
    const title = $('h1[data-automation-id="title"]').text();
    const images = $('div.av-fallback-packshot > img')
      .map((_index, image) => ($(image).attr('src') || '').trim())
      .get();
    const keywords = $('meta[name="keywords"]').attr('content');
    // Read each matching cell separately: calling .text() on the whole set
    // concatenates the cells with no separator and merges adjacent names.
    const cast = $('th:contains("Starring"), th:contains("Supporting actors")')
      .next()
      .map((_index, cell) => $(cell).text())
      .get()
      .join(',')
      .split(',')
      .map((name) => name.trim())
      .filter((name) => name.length > 0);
    const duration = parseInt($('div.av-badges > span').eq(2).text().split(' ')[0], 10);
    const movie = {
      program: {
        description,
        releaseYear,
        genres,
        title,
        images,
        keywords,
        cast,
        duration,
      },
    };
    debug('Movie em crawler:parsevideodata: %O', movie);
    return movie;
  }
}
// Expose the Scraper class as this module's export.
module.exports = Scraper;
// NOTE(review): `Crawler` is already bound by a `const` require earlier in
// this file, so this `var` redeclaration is a SyntaxError when the file is
// parsed as a whole. Possibly two files were concatenated here — confirm,
// then drop the duplicate or split the files.
var Crawler = require("crawler");
var fs = require("fs");