-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebscraper.cpp
More file actions
109 lines (96 loc) · 3.2 KB
/
webscraper.cpp
File metadata and controls
109 lines (96 loc) · 3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include<iostream>
#include<string>
#include<algorithm>
#include<curl/curl.h>
#include<libxml/HTMLparser.h>
#include<libxml/xpath.h>
#include <cctype>
using namespace std;
/// Returns a copy of `str` with every byte lower-cased via std::tolower.
///
/// Each char is converted to `unsigned char` before the call: passing a
/// negative `char` value (any byte >= 0x80 where `char` is signed) to the
/// <cctype> functions is undefined behaviour.
///
/// @param str  Text to convert (taken by value and modified in place).
/// @return The lower-cased text.
std::string strtolower(std::string str){
    std::transform(str.begin(), str.end(), str.begin(),
                   [](unsigned char ch) {
                       return static_cast<char>(std::tolower(ch));
                   });
    return str;
}
string scrape(string markup){
string res;
htmlDocPtr doc = htmlReadMemory(
markup.data(), markup.length(),
NULL, NULL,
HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING
);
if (!doc) return "HTML parse failed";
xmlXPathContextPtr context = xmlXPathNewContext(doc);
if (!context) {
xmlFreeDoc(doc);
return "XPath context error";
}
xmlXPathObjectPtr nodes = xmlXPathEvalExpression(
(xmlChar*)"//span[contains(@class,'dtText')]",
context
);
if (!nodes || !nodes->nodesetval) {
xmlXPathFreeContext(context);
xmlFreeDoc(doc);
return "No definitions available";
}
for (int i = 0; i < nodes->nodesetval->nodeNr; ++i) {
char* text = (char*)xmlNodeGetContent(nodes->nodesetval->nodeTab[i]);
if (text) {
res += text;
res += "\n";
xmlFree(text);
}
}
xmlXPathFreeObject(nodes);
xmlXPathFreeContext(context);
xmlFreeDoc(doc);
return res;
}
string request(string word){
CURLcode res_code=CURLE_FAILED_INIT;
CURL *curl=curl_easy_init();
string result;
string url="https://www.merriam-webster.com/dictionary/"+strtolower(word);
curl_easy_setopt(curl,CURLOPT_WRITEDATA,&result);
curl_easy_setopt(curl,CURLOPT_URL, url.c_str());
curl_easy_setopt(curl,CURLOPT_USERAGENT,"simple scraper");
if(curl){
curl_easy_setopt(curl,
CURLOPT_WRITEFUNCTION,
+[](char* contents, size_t size,size_t nmemb, string* data)->size_t
{
size_t new_size=size*nmemb;
if(data==NULL) return 0;
data->append(contents,new_size);
return new_size;
}
);
res_code=curl_easy_perform(curl);
if(res_code!=CURLE_OK){
return curl_easy_strerror(res_code);
}
long http_code = 0;
// Get the HTTP status code (200, 404, 500, etc.)
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
if(http_code == 404) {
curl_easy_cleanup(curl);
return "Error: Word not found (404).";
} else if (http_code != 200) {
curl_easy_cleanup(curl);
return "Error: Server returned status " + to_string(http_code);
}
curl_easy_cleanup(curl);
}
return result;
}
int main(int argc, char** argv){
if(argc!=2){
cout<<"Please provide a valid english word"<<'\n';
exit(EXIT_FAILURE);
}
curl_global_init(CURL_GLOBAL_ALL);
string arg=argv[1];
string res=request(arg);
cout<<scrape(res)<<'\n';
curl_global_cleanup();
return 0;
}