Arachne 1.0
Arachne - the perpetual stitcher of Wikidata entities.
Loading...
Searching...
No Matches
pheidippides.cpp
Go to the documentation of this file.
1/*
2 * The MIT License (MIT)
3 *
4 * Copyright (c) 2025 Yaroslav Riabtsev <yaroslav.riabtsev@rwth-aachen.de>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "pheidippides.hpp"
26#include "arachne.hpp"
27
28namespace arachnespace {
30 const std::unordered_set<std::string>& batch,
31 const corespace::entity_kind kind
32) {
33 if (batch.empty()) {
34 return nlohmann::json::object();
35 }
36 std::string url
38 ? "https://www.wikidata.org/w/api.php"
39 : "https://commons.wikimedia.org/w/api.php");
40 std::string props
41 = (kind != corespace::entity_kind::entity_schema ? join_str(opt.props)
42 : join_str(opt.prop));
43
44 corespace::parameter_list base_params { opt.params };
46 base_params.emplace_back("action", "query");
47 } else {
48 base_params.emplace_back("action", "wbgetentities");
49 }
50
51 std::string prefix {};
53 prefix = "EntitySchema:";
54 }
55 nlohmann::json combined = nlohmann::json::object();
56 for (auto&& chunk : batch | std::views::chunk(opt.batch_threshold)) {
57 std::vector<std::string> chunk_vec;
58 for (const auto& id : chunk) {
59 if (arachne::identify(id) != kind) {
60 continue;
61 }
62 chunk_vec.emplace_back(prefix + id);
63 }
64 corespace::parameter_list params { base_params };
65 auto entities = join_str(chunk_vec);
66
67 if (kind == corespace::entity_kind::entity_schema) {
68 params.emplace_back("titles", entities);
69 params.emplace_back("prop", props);
70 } else {
71 params.emplace_back("ids", entities);
72 params.emplace_back("props", props);
73 }
74 auto r = client.get(url, params);
75 auto data = nlohmann::json::parse(r.text, nullptr, true);
76 if (!data.is_object()) {
77 continue;
78 }
79 combined.merge_patch(data);
80 }
81 return combined;
82}
83
85 const auto
86 [method, url, query_params, form_params, body, content_type, accept,
87 timeout_sec, use_form_body]
88 = build_call_preview(request);
89 if (method == corespace::http_method::get) {
90 return nlohmann::json::parse(
91 client.get(url, query_params, accept, timeout_sec).text, nullptr,
92 true
93 );
94 }
95 if (use_form_body) {
96 return nlohmann::json::parse(
97 client
98 .post_form(url, form_params, query_params, accept, timeout_sec)
99 .text,
100 nullptr, true
101 );
102 }
103 return nlohmann::json::parse(
104 client
105 .post_raw(
106 url, body, content_type, query_params, accept, timeout_sec
107 )
108 .text,
109 nullptr, true
110 );
111}
112
113nlohmann::json pheidippides::wdqs(std::string query) {
114 corespace::sparql_request request;
115 request.query = std::move(query);
116 return sparql(request);
117}
118
119const corespace::network_metrics& pheidippides::metrics_info() const {
120 return client.metrics_info();
121}
122
125 return build_call_preview(request);
126}
127
128std::string pheidippides::join_str(
129 std::span<const std::string> ids, const std::string_view separator
130) {
131 if (ids.empty()) {
132 return {};
133 }
134 auto it = ids.begin();
135 std::string result = *it;
136 for (++it; it != ids.end(); ++it) {
138 result.append(*it);
139 }
140 return result;
141}
142
144 const corespace::sparql_request& request
145) const {
146 using namespace corespace;
147
148 call_preview preview;
149 const auto& profile = get_service_profile(service_kind::wdqs);
150 preview.url = profile.base_url;
151
152 const std::size_t threshold
153 = request.length_threshold == sparql_request::service_default
155 : request.length_threshold;
156
157 const auto method = choose_http_method(request, threshold);
158 preview.method = method;
159
160 preview.timeout_sec
161 = request.timeout_sec >= 0 ? request.timeout_sec : wdqs_opt.timeout_sec;
162
163 preview.accept = resolve_accept(request, profile, wdqs_opt.accept_override);
164
165 if (method == http_method::get) {
166 preview.query_params.emplace_back("query", request.query);
167 append_common_params(service_kind::wdqs, method, preview.query_params);
168 } else {
169 const auto [content_type, use_form_body]
170 = resolve_body_strategy(request);
171
172 preview.content_type = content_type;
173 preview.use_form_body = use_form_body;
174 if (preview.use_form_body) {
175 preview.form_params.emplace_back("query", request.query);
176 sort_parameters(preview.form_params);
177 } else {
178 preview.body = request.query;
179 }
180 append_common_params(service_kind::wdqs, method, preview.query_params);
181 }
182
183 return preview;
184}
185}
Batch courier for Wikidata/Commons: collects IDs, issues HTTP requests, and returns a merged JSON payload.
corespace::call_preview preview(const corespace::sparql_request &request) const
Produce a call preview describing the HTTP request that would be made.
nlohmann::json wdqs(std::string query)
Convenience wrapper to run a raw SPARQL query string.
corespace::call_preview build_call_preview(const corespace::sparql_request &request) const
corespace::wdqs_options wdqs_opt
nlohmann::json sparql(const corespace::sparql_request &request)
Execute a SPARQL query according to the provided request.
const corespace::network_metrics & metrics_info() const
Access aggregated network metrics of the underlying client.
nlohmann::json fetch_json(const std::unordered_set< std::string > &batch, corespace::entity_kind kind=corespace::entity_kind::any)
Fetch metadata for a set of entity IDs and return a merged JSON object.
http_method choose_http_method(const sparql_request &request, const std::size_t threshold)
Chooses the appropriate HTTP method for a SPARQL request.
Definition utils.cpp:42
http_method
HTTP method to use for a request.
Definition utils.hpp:201
entity_kind
Wikidata entity kind.
Definition utils.hpp:47
@ mediainfo
IDs prefixed with 'M'.
Definition utils.hpp:51
@ entity_schema
IDs prefixed with 'E'.
Definition utils.hpp:52
service_kind
Identifies supported SPARQL services.
Definition utils.hpp:74
const service_profile & get_service_profile(const service_kind kind)
Retrieve the service profile for a given service kind.
Definition utils.cpp:87
int timeout_sec
Per-request timeout in seconds (-1 for default).
Definition utils.hpp:272
std::string url
Full request URL (excluding query parameters).
Definition utils.hpp:259
http_method method
HTTP method to use for the request (GET, POST, etc.).
Definition utils.hpp:256
std::string content_type
Content-Type header value for the request body.
Definition utils.hpp:269
std::size_t length_threshold
Definition utils.hpp:250