forked from berthubert/tkconv
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtkgetxml.cc
More file actions
113 lines (104 loc) · 4.05 KB
/
tkgetxml.cc
File metadata and controls
113 lines (104 loc) · 4.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#include <unistd.h>

#include <cctype>
#include <cerrno>
#include <csignal>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <fmt/format.h>
#include <fmt/printf.h>

#include "httplib.h"
#include "sqlwriter.hh"
#include "pugixml.hpp"
using namespace std;
int main(int argc, char** argv)
{
vector<string> categories=
{"Activiteit", "ActiviteitActor", "Agendapunt", "Besluit", "Commissie",
"CommissieContactinformatie", "CommissieZetel", "CommissieZetelVastPersoon",
"CommissieZetelVastVacature", "CommissieZetelVervangerPersoon",
"CommissieZetelVervangerVacature", "Document", "DocumentActor",
"DocumentVersie", "Fractie", "FractieZetel", "FractieZetelPersoon",
"FractieZetelVacature", "Kamerstukdossier", "Persoon",
"PersoonContactinformatie", "PersoonGeschenk", "PersoonLoopbaan",
"PersoonNevenfunctie", "PersoonNevenfunctieInkomsten", "PersoonOnderwijs",
"PersoonReis", "Reservering", "Stemming", "Toezegging", "Vergadering",
"Verslag", "Zaak", "ZaakActor", "Zaal"};
signal(SIGPIPE, SIG_IGN); // every TCP application needs this
if(argc > 1) {
categories.clear();
for(int n = 1 ; n < argc; ++n)
categories.push_back(argv[n]);
}
SQLiteWriter sqlw("xml.sqlite3");
for(const auto& category: categories) {
sqlw.query("create table if not exists "+category+" (skiptoken INT)");
sqlw.query("create index if not exists "+category+"skipidx on "+category+"(skiptoken)");
string next="https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Feed?category=" + category;
int skiptoken = -1;
try {
auto ret = sqlw.queryT("select skiptoken from "+category+" order by rowid desc limit 1");
if(!ret.empty()) {
skiptoken = get<int64_t>(ret[0]["skiptoken"]);
next = fmt::format("https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Feed?skiptoken={}&category={}", skiptoken, category);
}
else {
fmt::print("Could not get a skiptoken for category {}. First run?\n", category);
}
}
catch(std::exception& e) {
fmt::print("Could not get a 'next' from database for category {}, starting from scratch\n", category);
}
int catentries=0;
while(!next.empty()) {
int entries = 0;
httplib::Client cli("https://gegevensmagazijn.tweedekamer.nl");
cli.set_connection_timeout(10, 0);
cli.set_read_timeout(10, 0);
cli.set_write_timeout(10, 0);
fmt::print("Retrieving from {}.. ", next);
cout.flush();
auto res = cli.Get(next);
if(!res) {
auto err = res.error();
throw runtime_error("Oops retrieving from "+next+" -> "+httplib::to_string(err));
}
next.clear();
pugi::xml_document doc;
if (!doc.load_string(res->body.c_str())) {
cout<<"Could not load XML"<<endl;
return -1;
}
auto feed = doc.child("feed");
if(!feed) {
cout<<"No feed in XML at "<<next<<"\n";
return -1;
}
for(const auto& node : feed.children("entry")) {
string id = node.child("title").child_value();
string updated = node.child("updated").child_value();
string enclosure;
entries++;
catentries++;
for (auto link : node.children("link")) {
if(link.attribute("rel").value() == string("enclosure")) {
enclosure = link.attribute("href").value();
}
else if(link.attribute("rel").value() == string("next")) {
if(auto href = link.attribute("href")) {
next = href.value();
// https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Feed?skiptoken=20127222&category=Document
if(next.find("https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Feed"))
throw std::runtime_error("Unexpected next URL format "+next);
if(auto pos = next.find("skiptoken="); pos ==string::npos)
throw std::runtime_error("Could not find skiptoken in "+next);
else {
skiptoken = atoi(next.substr(pos+10).c_str());
}
}
}
}
ostringstream xml;
node.print(xml, "\t", pugi::format_raw);
sqlw.addValue({{"category", category},{"id", id}, {"skiptoken", skiptoken}, {"enclosure", enclosure}, {"updated", updated}, {"xml", xml.str()}}, category);
}
fmt::print("got {} entries\n", entries);
usleep(100000);
}
cout<<"Done - saw "<<catentries<<" new entries for category"<<endl;
}
}