Fast, flexible, and lean implementation of core jQuery designed specifically for the server.
Hello everyone, I am trying to use cheerio in a Node.js application to parse a document into a structure that I can use to index documents with Algolia. The developer consuming this can pass in a structure such as "h1 h3 p" or "h1 h2 h3 p", so the structure is dynamic. I then need to collect the matching elements and create JSON objects that will be passed to Algolia for indexing. Below is a sample HTML structure, followed by the JSON objects that should be created from it:
<h1>Article Title</h1>
<h3>Section Title</h3>
<p>Content</p>
<h3>Section Two</h3>
<p>Content 2</p>
<h3>Section Three</h3>
<p>Content 3</p>
<h1>Secondary Title</h1>
<p>Secondary Content</p>
So for options.structure I would pass in "h1 h3 p", which would then create the following structures to pass to Algolia:
{
"link":"path/article#article-title",
"importance":0,
"objectID":"path/article#article-title-816d21744be4034a4bfa3323722024b4",
"h1":"Article Title"
}
{
"link":"path/article#section-title",
"importance":3,
"objectID":"path/article#section-title-816d21744be4034a4bfa3323722024b4",
"h1":"Article Title",
"h3":"Section Title"
}
{
"link":"path/article#section-title-p0",
"importance":5,
"objectID":"path/article#section-title-p0-ae5e3efcdddc387616bbc5b1e5b1b134",
"h1":"Article Title",
"h3":"Section Title",
"p":"Content"
}
{
"link":"path/article#section-two",
"importance":3,
"objectID":"path/article#section-two-816d21744be4034a4bfa3323722024b4",
"h1":"Article Title",
"h3":"Section Two"
}
{
"link":"path/article#section-two-p0",
"importance":5,
"objectID":"path/article#section-two-p0-ae5e3efcdddc387616bbc5b1e5b1b134",
"h1":"Article Title",
"h3":"Section Two",
"p":"Content 2"
}
{
"link":"path/article#section-three",
"importance":3,
"objectID":"path/article#section-three-816d21744be4034a4bfa3323722024b4",
"h1":"Article Title",
"h3":"Section Three"
}
{
"link":"path/article#section-three-p0",
"importance":5,
"objectID":"path/article#section-three-p0-ae5e3efcdddc387616bbc5b1e5b1b134",
"h1":"Article Title",
"h3":"Section Three",
"p":"Content 3"
}
{
"link":"path/article#secondary-title",
"importance":0,
"objectID":"path/article#secondary-title-816d21744be4034a4bfa3323722024b4",
"h1":"Secondary Title"
}
{
"link":"path/article#secondary-title-p0",
"importance":4,
"objectID":"path/article#secondary-title-p0-816d21744be4034a4bfa3323722024b4",
"h1":"Secondary Title",
"p":"Secondary Content"
}
Now this structure would have to be dynamic, meaning that rather than the above they could pass in "h1 h2 h3 h4 p" and it should be parsed accordingly.
There could also be content directly under an h1, such as <h1>Title</h1><p>Content</p>, and this would receive a different importance, as seen in the last example above.
The trouble I am having is how to select all the paragraphs under each tag in the structure. I know I can get all p tags under h1, and that will return them all, but it will not tell me which h1 each p belongs to, etc.
I am following this article and trying to replicate it using cheerio:
https://blog.algolia.com/how-to-build-a-helpful-search-for-technical-documentation-the-laravel-example/
Hello guys, I've just started using CheerioJS and would like to know how to scrape a website. $("#bproducts div.p div.pname") will return an object with all the product names, but I want to get the div.pprice next to each div.pname.
I was trying to make two arrays like:
const pName = $("#bproducts div.p div.pname");
const pPrice = $("#bproducts div.p div.pprice");
and then glue them together into one object as key:value pairs.
What I want is to get key:value pairs like pname:pprice.
Hi there, dear community. I have a question about a scraping issue I'm having: I'm trying to get the text of an 'a' tag, but each time I use $(element).text() I receive an empty string. Looking at the HTML I can see everything there, but there are also Angular elements. The scraping result for my 'a' tag looks like this:
{ '0':
{ type: 'tag',
name: 'a',
namespace: 'http://www.w3.org/1999/xhtml',
attribs:
{ 'ng-href': '{{treatPathProduct(product.Characteristics[\'Path\'])}}',
'ng-bind-html': 'product.Characteristics[\'##ProductLabel\']' },
'x-attribsNamespace': { 'ng-href': undefined, 'ng-bind-html': undefined },
'x-attribsPrefix': { 'ng-href': undefined, 'ng-bind-html': undefined },
children: [],
parent:
{ type: 'tag',
name: 'h2',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: [Object],
prev: [Object],
next: [Object] },
prev: null,
next: null },
options:
{ withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1 }
My question is: how can I get the text and the href attribute? Both are empty, or at least seem to be empty.
I am trying
$("iframe").each(function (_i, link) {
const data = cheerio.html(link)
})
// output : <iframe id="iframe"></iframe>
This gives me only the iframe tag itself, but I want the full HTML that is present inside the iframe tag,
e.g.: <html><body><h1>hello world</h1></body></html>
var cheerio = require('cheerio');
const $ = cheerio.load('<ul class="cards-wrapper">...</ul>');
$.html();
pm.test('Test name', function () {
$('.cards-wrapper').text("12345");
});