Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
91 commits
Select commit Hold shift + click to select a range
6fdf15a
support jamie oliver scraping
shaniqwa Apr 28, 2021
ecbe2bd
fix some tests. add defaultSetDescription to BaseScraper
shaniqwa Apr 28, 2021
cda11c4
fix more tests
shaniqwa Apr 28, 2021
8669a34
rename
shaniqwa Apr 28, 2021
e451674
fix SimplyRecipes tests
shaniqwa Apr 28, 2021
a005ae9
fix yummly. update puppeteer to latest version
shaniqwa Apr 28, 2021
b39a9f6
fix thespruceeats tests
shaniqwa Apr 28, 2021
4db4c1d
theSpruceEast - add recipe description
shaniqwa Apr 28, 2021
55d8d9f
fix woolworths
shaniqwa Apr 28, 2021
27e38f9
closetCooking add description
shaniqwa Apr 28, 2021
00ba239
try to change travis node version
shaniqwa Apr 28, 2021
6b217c2
revert travis node version to stable
shaniqwa Apr 28, 2021
caa38c9
more logs
shaniqwa Apr 29, 2021
baf8a87
logs
shaniqwa Apr 29, 2021
f6ff471
log
shaniqwa Apr 29, 2021
affc05a
log
shaniqwa Apr 29, 2021
70314e3
log
shaniqwa Apr 29, 2021
f04e3a4
skip should fetch the expected recipe test if server is not responding
shaniqwa Apr 29, 2021
20583c3
add ambitiouskitchen description
shaniqwa Apr 29, 2021
4736af1
add recipetineats description
shaniqwa Apr 29, 2021
b47e379
add GimmeDelicious description
shaniqwa Apr 29, 2021
1e07117
add tastOfHome description
shaniqwa Apr 29, 2021
71e1029
add SeriousEats description
shaniqwa Apr 29, 2021
7045020
add therecipecritic description
shaniqwa Apr 29, 2021
f426a0f
add myrecipes description
shaniqwa Apr 29, 2021
7a7e1ae
bonappetit add descriptions and tags
shaniqwa Apr 29, 2021
5821206
tastesbetterfromscratch add description and tags
shaniqwa Apr 29, 2021
dd2f66a
kitchen stories - add description and tags
shaniqwa Apr 29, 2021
972b953
foodandwine add description
shaniqwa Apr 29, 2021
60c7803
NomNomPaleo add description
shaniqwa Apr 29, 2021
7d6bc4a
budgetbytes add description
shaniqwa Apr 29, 2021
077b462
melskitchencafe add description
shaniqwa Apr 29, 2021
7d6c137
melskitchencafe add tags
shaniqwa Apr 29, 2021
7ea10ed
add description
shaniqwa Apr 29, 2021
b03707a
add description
shaniqwa Apr 29, 2021
01a2b1f
add description
shaniqwa Apr 29, 2021
def84dd
add description
shaniqwa Apr 29, 2021
616e69a
add description
shaniqwa Apr 29, 2021
41dbaee
add description
shaniqwa Apr 29, 2021
cce53ee
description
shaniqwa Apr 29, 2021
0422101
allRecipes description
shaniqwa Apr 29, 2021
97a9611
add description
shaniqwa Apr 29, 2021
caf42c3
add description
shaniqwa Apr 29, 2021
b9b9b95
descriptions
shaniqwa Apr 29, 2021
9f378cb
more descriptions
shaniqwa Apr 29, 2021
74317f1
comment fix
shaniqwa Apr 29, 2021
83ec05e
remove moment js
shaniqwa May 3, 2021
8b3c584
remove log
shaniqwa May 3, 2021
d84d2e3
fix tags
shaniqwa May 3, 2021
75c85e9
Merge pull request #1 from shaniqwa/jamie-oliver
shaniqwa Jul 8, 2021
a2364ec
fix some tests. add defaultSetDescription to BaseScraper
shaniqwa Apr 28, 2021
47879bd
fix more tests
shaniqwa Apr 28, 2021
668bf36
fix woolworths
shaniqwa Apr 28, 2021
5fe3af0
try to change travis node version
shaniqwa Apr 28, 2021
d729e97
revert travis node version to stable
shaniqwa Apr 28, 2021
0234e5c
more logs
shaniqwa Apr 29, 2021
1dec3a6
logs
shaniqwa Apr 29, 2021
c33596e
log
shaniqwa Apr 29, 2021
8dd9980
skip should fetch the expected recipe test if server is not responding
shaniqwa Apr 29, 2021
d929b2b
add myrecipes description
shaniqwa Apr 29, 2021
46b0672
add description
shaniqwa Apr 29, 2021
81d3a44
comment fix
shaniqwa Apr 29, 2021
01f4700
remove moment js
shaniqwa May 3, 2021
f7ef513
fix tags
shaniqwa May 3, 2021
f465c30
rebase
shaniqwa Aug 7, 2021
2b85e8e
remove console.log
shaniqwa Aug 7, 2021
cc60ea8
delete duplicate function
shaniqwa Aug 7, 2021
192b19e
rename file- typo
shaniqwa Aug 7, 2021
9782144
fix tests
shaniqwa Aug 7, 2021
9b1669d
add ld+json default scraper. apply to bon appetit and taste of home a…
shaniqwa Aug 8, 2021
b6023ec
bug fixs
shaniqwa Aug 8, 2021
86aa661
minor fixes
shaniqwa Aug 9, 2021
e651c01
woolworths should use the ld json schema, but it's still not working …
shaniqwa Aug 9, 2021
d776a87
remove woolworths test
shaniqwa Aug 9, 2021
f4533ac
fix handling unsupported / invalid urls
shaniqwa Aug 9, 2021
fbad66d
html decode - add regex for removing specific short cods
shaniqwa Aug 15, 2021
ee6379c
fix test
shaniqwa Aug 15, 2021
daa4b0a
Merge pull request #2 from shaniqwa/JSON+LD
shaniqwa Aug 15, 2021
ba11390
add support for HowToSection. add test for a website requested in iss…
shaniqwa Aug 15, 2021
2ca3b61
Merge pull request #3 from shaniqwa/JSON+LD
shaniqwa Aug 15, 2021
00ae10b
fix typo
shaniqwa Aug 16, 2021
36be742
add logs
shaniqwa Aug 16, 2021
7485a33
Throw errors
shaniqwa Aug 16, 2021
bd18d28
update puppeteer to latest version
shaniqwa Aug 16, 2021
89e26fb
don't use puppeteer in json scraper
shaniqwa Aug 16, 2021
1a85858
remove logs, fix test
shaniqwa Aug 16, 2021
d89b6d9
fix tests
shaniqwa Aug 16, 2021
48c8413
Merge pull request #4 from shaniqwa/JSON+LD
shaniqwa Aug 21, 2021
d53f5cc
if no recipe schema was found, do not throw error. return page title,…
shaniqwa Dec 13, 2021
97afad1
Merge branch 'master' into JSON+LD
shaniqwa Dec 13, 2021
bf64b78
fix all tests
shaniqwa Dec 15, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 6 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ recipeScraper("some.recipe.url").then(recipe => {
- https://copykat.com/
- https://damndelicious.net/
- https://www.eatingwell.com/
- https://www.epicurious.com/
- https://www.food.com/
- https://www.foodandwine.com/
- https://www.foodnetwork.com/
Expand Down Expand Up @@ -78,7 +77,8 @@ recipeScraper("some.recipe.url").then(recipe => {
- https://www.yummly.com/
- https://www.jamieoliver.com/

Don't see a website you'd like to scrape? Open an [issue](https://github.com/jadkins89/Recipe-Scraper/issues) and we'll do our best to add it.
And many more! The sites listed above are scraped with dedicated, site-specific scrapers, but any other website that embeds Google's Recipe structured data (ld+json) will work as well.


## Recipe Object

Expand All @@ -105,6 +105,8 @@ Depending on the recipe, certain fields may be left blank. All fields are repres

## Error Handling

If a recipe is not found on the given url, the basic page info will be returned: title, image & description.

If the url provided is invalid and a domain is unable to be parsed, an error message will be returned.

```javascript
Expand All @@ -114,24 +116,6 @@ recipeScraper("keyboard kitty").catch(error => {
});
```

If the url provided doesn't match a supported domain, an error message will be returned.

```javascript
recipeScraper("some.invalid.url").catch(error => {
console.log(error.message);
// => "Site not yet supported"
});
```

If a recipe is not found on a supported domain site, an error message will be returned.

```javascript
recipeScraper("some.no.recipe.url").catch(error => {
console.log(error.message);
// => "No recipe found on page"
});
```

If a page does not exist or some other 400+ error occurs when fetching, an error message will be returned.

```javascript
Expand All @@ -150,6 +134,8 @@ recipeScraper("some.improper.url").catch(error => {
});
```



## Bugs

With web scraping comes a reliance on the website being used not changing format. If this occurs we need to update our scrape. Please reach out if you are experiencing an issue.
Expand Down
242 changes: 233 additions & 9 deletions helpers/BaseScraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class BaseScraper {

return res.ok; // res.status >= 200 && res.status < 300
} catch (e) {
console.log(e)
// console.log(e)
return false;
}
}
Expand All @@ -47,6 +47,192 @@ class BaseScraper {
throw new Error("No recipe found on page");
}

/**
* look for LD+JOSN script in the web page.
* @param {object} $ - a cheerio object representing a DOM
* @returns {boolean} - if exist, set recipe data and return true, else - return false.
*/
defaultLD_JOSN($) {
const jsonLDs = Object.values($("script[type='application/ld+json']"));
let isRecipeSchemaFound = false;

jsonLDs.forEach(jsonLD => {
if (jsonLD && jsonLD.children && Array.isArray(jsonLD.children)) {
jsonLD.children.forEach(el => {
if (el.data) {

const jsonRaw = el.data;
const result = JSON.parse(jsonRaw);
let recipe;



if (result['@graph'] && Array.isArray(result['@graph'])) {
result['@graph'].forEach(g => {
if (g['@type'] === 'Recipe') {
recipe = g;
}
})
}

if (result['@type'] === 'Recipe') {
recipe = result;
}

if (Array.isArray(result['@type']) && result['@type'].includes('Recipe')) {
recipe = result;
}

if (recipe) {
// console.log('found a Recipe type json schema!');
try {
// name
this.recipe.name = BaseScraper.HtmlDecode($, recipe.name);

// description
if (recipe.description) {
this.recipe.description = BaseScraper.HtmlDecode($, recipe.description);
} else {
this.defaultSetDescription($);
}

// image
if (Array.isArray(recipe.image)) {
recipe.image = recipe.image[0];
}

if (recipe.image) {
if (recipe.image["@type"] === "ImageObject" && recipe.image.url) {
this.recipe.image = recipe.image.url;
} else if (typeof recipe.image === "string") {
this.recipe.image = recipe.image;
}
} else {
this.defaultSetImage($);
}


// tags
this.recipe.tags = [];
if (recipe.keywords) {
if (typeof recipe.keywords === "string") {
this.recipe.tags = [...recipe.keywords.split(',')]
} else if (Array.isArray(recipe.keywords)) {
this.recipe.tags = [...recipe.keywords]
}
}

if (recipe.recipeCuisine) {
if (typeof recipe.recipeCuisine === "string") {
this.recipe.tags.push(recipe.recipeCuisine)
} else if (Array.isArray(recipe.recipeCuisine)) {
this.recipe.tags = [...new Set([...this.recipe.tags, ...recipe.recipeCuisine])]
}
}

if (recipe.recipeCategory) {
if (typeof recipe.recipeCategory === "string") {
this.recipe.tags.push(recipe.recipeCategory)
} else if (Array.isArray(recipe.recipeCategory)) {
this.recipe.tags = [...new Set([...this.recipe.tags, ...recipe.recipeCategory])]
}
}

this.recipe.tags = this.recipe.tags.map(i => BaseScraper.HtmlDecode($, i));
this.recipe.tags = [...new Set(this.recipe.tags)];

// ingredients
if (Array.isArray(recipe.recipeIngredient)) {
this.recipe.ingredients = recipe.recipeIngredient.map(i => BaseScraper.HtmlDecode($, i));
} else if (typeof recipe.recipeIngredient === "string") {
this.recipe.ingredients = recipe.recipeIngredient.split(",").map(i => BaseScraper.HtmlDecode($, i.trim()));
}

// instructions (may be string, array of strings, or object of sectioned instructions)
this.recipe.instructions = [];
this.recipe.sectionedInstructions = [];

if (recipe.recipeInstructions &&
recipe.recipeInstructions["@type"] === "ItemList" &&
recipe.recipeInstructions.itemListElement) {

recipe.recipeInstructions.itemListElement.forEach(section => {
this.recipe.instructions = [
...this.recipe.instructions,
...section.itemListElement.map(i => BaseScraper.HtmlDecode($, i.text))
];
section.itemListElement.forEach(i => {
this.recipe.sectionedInstructions.push({
sectionTitle: section.name,
text: BaseScraper.HtmlDecode($, i.text),
image: i.image || ''
})
});
});
} else if (Array.isArray(recipe.recipeInstructions)) {
recipe.recipeInstructions.forEach(instructionStep => {
if (instructionStep["@type"] === "HowToStep") {
this.recipe.instructions.push(BaseScraper.HtmlDecode($, instructionStep.text));
this.recipe.sectionedInstructions.push({
sectionTitle: instructionStep.name || '',
text: BaseScraper.HtmlDecode($, instructionStep.text),
image: instructionStep.image || ''
})
} else if (instructionStep["@type"] === "HowToSection") {
if (instructionStep.itemListElement) {
instructionStep.itemListElement.forEach(step => {
this.recipe.instructions.push(BaseScraper.HtmlDecode($, step.text));

this.recipe.sectionedInstructions.push({
sectionTitle: instructionStep.name,
text: BaseScraper.HtmlDecode($, step.text),
image: step.image || ''
})
});
}
} else if (typeof instructionStep === "string") {
this.recipe.instructions.push(BaseScraper.HtmlDecode($, instructionStep));
}
});
} else if (typeof recipe.recipeInstructions === "string") {
this.recipe.instructions = [BaseScraper.HtmlDecode($, recipe.recipeInstructions)]
}

// prep time
if (recipe.prepTime) {
this.recipe.time.prep = BaseScraper.parsePTTime(recipe.prepTime);
}

// cook time
if (recipe.cookTime) {
this.recipe.time.cook = BaseScraper.parsePTTime(recipe.cookTime);
}

// total time
if (recipe.totalTime) {
this.recipe.time.total = BaseScraper.parsePTTime(recipe.totalTime);
}

// servings
if (Array.isArray(recipe.recipeYield)) {
this.recipe.servings = recipe.recipeYield[0];
} else if (typeof recipe.recipeYield === "string") {
this.recipe.servings = recipe.recipeYield;
}

isRecipeSchemaFound = true;
} catch (e) {
console.log(e);
}
}
}
});
}
});

return isRecipeSchemaFound;
}

/**
* @param {object} $ - a cheerio object representing a DOM
* @returns {string|null} - if found, an image url
Expand All @@ -58,6 +244,21 @@ class BaseScraper {
$("meta[itemprop='image']").attr("content");
}

/**
* @param {object} $ - a cheerio object representing a DOM
* if found, set recipe name
*/
defaultSetName($) {
let title =
$("meta[name='title']").attr("content") ||
$("meta[property='og:title']").attr("content") ||
$("meta[name='twitter:title']").attr("content");

title = title.split('|')[0];

this.recipe.name = title ? title.trim() : '';
}

/**
* @param {object} $ - a cheerio object representing a DOM
* if found, set recipe description
Expand All @@ -77,11 +278,16 @@ class BaseScraper {
*/
async fetchDOMModel() {
try {
const res = await fetch(this.url);
const meta = [
['User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15'],
];
const headers = new fetch.Headers(meta);
const res = await fetch(this.url, {headers});
const html = await res.text();
return cheerio.load(html);
} catch (err) {
this.defaultError();
throw err;
// this.defaultError();
}
}

Expand All @@ -91,9 +297,15 @@ class BaseScraper {
*/
async fetchRecipe() {
this.checkUrl();
const $ = await this.fetchDOMModel();
this.createRecipeObject();
this.scrape($);
try {
const $ = await this.fetchDOMModel();
this.createRecipeObject();
this.scrape($);
} catch (e) {
// throw e;
this.defaultError();
}

return this.validateRecipe();
}

Expand All @@ -110,25 +322,37 @@ class BaseScraper {
return el.text().trim();
}

static HtmlDecode($, s) {
const res = $('<div>').html(s).text() || "";

return res.trim()
.replace(/amp;/gm, '')
.replace(/(?=\[caption).*?(?<=\[ caption\])/g, '') // removes short-codes [caption.*[ caption]
.replace(/\n/g, "");
}

/**
* Validates scraped recipes against defined recipe schema
* @returns {object} - an object representing the recipe
*/
validateRecipe() {
let res = validate(this.recipe, recipeSchema);
if (!res.valid) {
// res.errors.forEach(error => {
// console.log(error.property + ' ' + error.message);
// });
this.defaultError();
}
return this.recipe;
}

static parsePTTime(ptTime) {
ptTime = ptTime.replace('PT', '');
ptTime = ptTime.replace('H', ' hours');
ptTime = ptTime.replace('M', ' minutes');
ptTime = ptTime.replace('H', ' hours ');
ptTime = ptTime.replace('M', ' minutes ');
ptTime = ptTime.replace('S', ' seconds');

return ptTime;
return ptTime.trim();
}
}

Expand Down
31 changes: 31 additions & 0 deletions helpers/DefaultLdJsonScraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
const BaseScraper = require("./BaseScraper");

/**
 * Generic scraper for sites without a dedicated scraper: it relies on the
 * page's LD+JSON Recipe schema and, failing that, on basic page metadata.
 */
class DefaultLdJsonScraper extends BaseScraper {
  /**
   * @param {object} $ - a cheerio object representing a DOM
   */
  scrape($) {
    // A Recipe schema, when present, already filled the recipe object.
    if (this.defaultLD_JOSN($)) {
      return;
    }

    // No recipe schema on the page: fall back to the basic page info.
    this.defaultSetName($);
    this.defaultSetDescription($);
    this.defaultSetImage($);
  }
}

module.exports = DefaultLdJsonScraper;
Loading