Arachne 1.0
Arachne - the perpetual stitcher of Wikidata entities.
Loading...
Searching...
No Matches
utils.hpp
Go to the documentation of this file.
1/*
2 * The MIT License (MIT)
3 *
4 * Copyright (c) 2025 Yaroslav Riabtsev <yaroslav.riabtsev@rwth-aachen.de>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#ifndef ARACHNE_UTILS_HPP
26#define ARACHNE_UTILS_HPP
27#include <array>
28#include <atomic>
29#include <curl/curl.h>
30#include <limits>
31#include <map>
32#include <string>
33#include <vector>
34
35namespace corespace {
36
38/**
39 * @brief Wikidata entity kind.
40 *
41 * Each kind corresponds to a canonical identifier shape:
42 * - item (IDs such as "Q123"), property ("P45"), lexeme ("L7"),
43 * mediainfo ("M9"), entity_schema ("E2"), form ("L7-F1"), sense ("L7-S2").
44 * `any` acts as an API selector; `unknown` denotes an invalid or
45 * unrecognized identifier.
 *
 * @note NOTE(review): the first seven enumerators look like they index the
 * per-kind batch arrays sized by `batched_kind_count` (Q, P, L, M, E, form,
 * sense) - confirm against arachne.hpp before reordering or inserting.
46 */
47enum class entity_kind {
48 item, ///< IDs prefixed with 'Q' (e.g., "Q123").
49 property, ///< IDs prefixed with 'P' (e.g., "P45").
50 lexeme, ///< IDs prefixed with 'L' without a form/sense suffix (e.g., "L7").
51 mediainfo, ///< IDs prefixed with 'M' (e.g., "M9").
52 entity_schema, ///< IDs prefixed with 'E' (e.g., "E2").
53 form, ///< Lexeme form IDs such as "L<lexeme>-F<form>".
54 sense, ///< Lexeme sense IDs such as "L<lexeme>-S<sense>".
55 any, ///< API selector (e.g., flush(any)); not directly batchable.
56 unknown ///< Unrecognized/invalid identifier.
57};
58
59/// @brief Single query parameter as a key/value pair (encoding is handled by
60/// libcurl, so no manual pre-encoding is needed).
61using parameter = std::pair<std::string, std::string>;
62/// @brief Ordered list of query parameters appended to the URL.
64
65/**
66 * @brief Identifies supported SPARQL services.
67 *
68 * Used to select which SPARQL endpoint to query. Currently only
69 * `wdqs` (Wikidata Query Service) is supported.
70 *
71 * Values:
72 * - wdqs: Wikidata Query Service (https://query.wikidata.org)
73 */
75
76/**
77 * @struct options
78 * @brief Configuration for fetching entities via MediaWiki/Wikibase API.
79 *
80 * Semantics:
81 * - `batch_threshold`: maximum number of IDs or titles per request chunk.
82 * - `prop`: fields requested for EntitySchema queries (`action=query`).
83 * - `props`: fields requested for `wbgetentities` (Q/P/L/M).
84 * - `params`: base parameters applied to all requests (languages, format,
85 * revision content, normalization, and related API flags).
86 */
87struct options {
89
90 std::vector<std::string> prop = { "info", "revisions" };
92 = { "aliases", "claims", "datatype", "descriptions",
93 "info", "labels", "sitelinks/urls" };
94
95 parameter_list params { { "languages", "en" }, { "languagefallback", "1" },
96 { "format", "json" }, { "formatversion", "2" },
97 { "rvslots", "main" }, { "rvprop", "content" },
98 { "normalize", "1" } };
99};
100
101/**
102 * @struct network_metrics
103 * @brief Thread-safe counters describing client-side networking activity.
104 *
105 * Semantics:
106 * - `requests` counts finished transfer attempts (successful or not).
107 * - `retries` counts retry cycles triggered by retryable outcomes.
108 * - `sleep_ms` is the total backoff time slept between attempts.
109 * - `network_ms` is the accumulated wall-clock duration spent inside
110 * libcurl for performed requests (sum over attempts).
111 * - `bytes_received` sums body sizes appended via the write callback.
112 * - `statuses[i]` counts responses with HTTP status `i` (0..599). Values
113 * outside the array bounds are ignored.
114 *
115 * All counters are atomics and rely on the default sequentially consistent
116 * operations provided by `std::atomic`. Each counter is consistent on its own;
117 * values read from different counters do not form a single atomic snapshot.
118 */
119struct network_metrics final {
120 std::atomic<unsigned> requests {
121 0
122 }; ///< Finished attempts (success or failure).
123 std::atomic<unsigned> retries { 0 }; ///< Number of retry cycles triggered.
124 std::atomic<long long> sleep_ms {
125 0
126 }; ///< Total backoff duration slept (ms).
127 std::atomic<long long> network_ms {
128 0
129 }; ///< Total time spent in libcurl (ms).
131 0
132 }; ///< Sum of response body sizes (bytes).
133 std::array<std::atomic<unsigned>, 600>
134 statuses; ///< Per-code histogram for HTTP 0..599.
135
136 /**
137 * @brief Zero-initialize per-status counters.
138 *
139 * The constructor explicitly clears the `statuses` histogram.
140 */
142};
143
144/**
145 * @struct http_response
146 * @brief Result object for an HTTP transfer.
147 *
148 * Invariants:
149 * - `error_code == CURLE_OK` means libcurl completed without a transport
150 * error.
151 * - `status_code` carries the HTTP status (2xx denotes success).
152 * - `header` contains response headers from the final transfer attempt.
153 * - `text` accumulates the response body as received.
154 * - When `error_code != CURLE_OK`, `error_message` contains a stable
155 * human-readable description (from `curl_easy_strerror`).
156 */
158 /// Case-preserving multimap of response headers (as returned by libcurl).
160
161 size_t status_code = 0; ///< HTTP status code (e.g., 200, 404).
162 header_map header; ///< Response headers from the final attempt.
163 std::string text; ///< Response body accumulated across callbacks.
164 CURLcode error_code = CURLE_OK; ///< libcurl transport/result code.
165 std::string error_message; ///< Non-empty on libcurl error.
166};
167
168/**
169 * @struct network_options
170 * @brief Fixed runtime options for the HTTP client.
171 *
172 * Timeouts and retry policy:
173 * - `timeout_ms`: total operation timeout (libcurl `CURLOPT_TIMEOUT_MS`).
174 * - `connect_ms`: connect timeout (libcurl `CURLOPT_CONNECTTIMEOUT_MS`).
175 * - `max_retries`: maximum number of retries after the first attempt.
176 * - `retry_base_ms`: base delay for exponential backoff with jitter.
177 * - `retry_max_ms`: hard cap for a single backoff sleep.
178 *
179 * Headers and identity:
180 * - `accept`: value for the `Accept:` request header.
181 * - `user_agent`: value for the `User-Agent:` request header.
182 */
184 int timeout_ms = 10000; ///< Total request timeout (ms).
185 int connect_ms = 3000; ///< Connect timeout (ms).
186 int max_retries = 3; ///< Max retry attempts after the first try.
187 int retry_base_ms = 200; ///< Base for exponential backoff (ms).
188 long long retry_max_ms = 3000; ///< Max per-attempt backoff (ms).
189
190 std::string accept = "application/json"; ///< Default Accept header.
191 std::string user_agent = "arachne/client"; ///< Default User-Agent.
192};
193
194/**
195 * @brief HTTP method to use for a request.
196 *
197 * Represents the actual HTTP method used when sending a request.
 * The policy for choosing a method is expressed by `http_method_hint`.
 *
198 * - `get`: Use the HTTP GET method.
199 * - `post`: Use the HTTP POST method.
200 */
201enum class http_method { get, post };
202/**
203 * @brief Hint for selecting the HTTP method for a request.
204 *
205 * Used to determine which HTTP method to use based on query length or explicit
206 * override.
207 * - `automatic`: Selects GET or POST based on query length (e.g., GET for short
208 * queries, POST for long).
209 * - `force_get`: Forces the use of GET regardless of query length.
210 * - `force_post`: Forces the use of POST regardless of query length.
211 *
212 * This differs from `http_method` in that it provides a policy for method
213 * selection, rather than specifying the method directly.
214 */
216
227
228/**
229 * @struct service_profile
230 * @brief Static configuration values describing a remote service.
231 *
232 * Contains the base endpoint URL, the default Accept header value used
233 * when a request does not specify one, and optional rate hint strings
234 * (for example, "polite" or "limit") that guide client throttling.
235 */
241
242/**
243 * @brief Options specific to WDQS usage and heuristics.
244 *
245 * - length_threshold: query length above which POST is preferred.
246 * - timeout_sec: per-request timeout in seconds.
247 * - accept_override: optional runtime Accept header override.
248 */
250 std::size_t length_threshold = 1800;
251 int timeout_sec = 60;
252 std::string accept_override;
253};
254
258 }; ///< HTTP method to use for the request (GET or POST).
259 std::string url; ///< Full request URL (excluding query parameters).
261 query_params; ///< Parameters to be appended to the URL as a query
262 ///< string (for GET/URL-encoded requests).
264 form_params; ///< Parameters to be sent in the request body as form data
265 ///< (for POST requests with form encoding).
266 std::string body; ///< Raw request body (used for POST requests with
267 ///< non-form content, e.g., JSON or SPARQL).
268 std::string
269 content_type; ///< Content-Type header value for the request body.
270 std::string
271 accept; ///< Accept header value indicating expected response format.
272 int timeout_sec = -1; ///< Per-request timeout in seconds (-1 for default).
273 bool use_form_body { false }; ///< If true, send form_params as the request
274 ///< body; otherwise, use raw body.
275
276 /**
277 * @brief Check whether a query parameter with key @p key exists.
278 * @param key Key to search for.
279 * @return true if a parameter with the given key is present.
280 */
281 [[nodiscard]] bool has_param(std::string_view key) const;
282
283 /**
284 * @brief Retrieve the first value for query parameter @p key.
285 * @param key Key to search for.
286 * @return Value associated with @p key or empty string if not found.
287 */
288 [[nodiscard]] std::string get_param(std::string_view key) const;
289};
290
291/**
292 * @brief Retrieve the service profile for a given service kind.
293 * @param kind The service kind to look up.
294 * @return Reference to the corresponding service_profile.
295 */
297/**
298 * @brief Sorts the parameter list in-place by key.
299 * @param params The parameter list to sort. Modified in-place.
300 * @note Side effect: The input parameter list is reordered.
 * @note NOTE(review): the relative order of entries that share a key is not
 * specified in this header - confirm against the definition in utils.cpp.
301 */
302void sort_parameters(parameter_list& params);
303/**
304 * @brief Appends common parameters required for a service and HTTP method.
305 * @param kind The service kind.
306 * @param method The HTTP method.
307 * @param params The parameter list to append to. Modified in-place.
308 * @note Side effect: The input parameter list is extended.
309 */
311 service_kind kind, http_method method, parameter_list& params
312);
313/**
314 * @brief Chooses the appropriate HTTP method for a SPARQL request.
315 * @param request The SPARQL request.
316 * @param threshold The length threshold above which POST is preferred.
317 * @return The selected HTTP method (GET or POST).
318 */
320choose_http_method(const sparql_request& request, std::size_t threshold);
321/**
322 * @brief Resolves the Accept header value for a SPARQL request.
323 * @param request The SPARQL request (carries its own `accept` value).
324 * @param profile The service profile (supplies the default Accept value).
325 * @param override_accept Optional override for the Accept header.
326 * @return The resolved Accept header value.
 * @note NOTE(review): precedence is presumably `override_accept`, then the
 * request's `accept`, then the profile's `default_accept`, with an empty
 * string meaning "not set" - confirm against the definition in utils.cpp.
327 */
328std::string resolve_accept(
329 const sparql_request& request, const service_profile& profile,
330 std::string_view override_accept
331);
332/**
333 * @brief Determines the body content and strategy for a SPARQL request.
334 * @param request The SPARQL request.
335 * @return A pair where:
336 * - first: The body content as a string.
337 * - second: A boolean indicating whether to use form body (true) or raw body
338 * (false).
339 */
340std::pair<std::string, bool>
342
343}
344#endif // ARACHNE_UTILS_HPP
Accumulates entity IDs into per-kind batches and organizes groups.
Definition arachne.hpp:47
std::unordered_map< std::string, int > candidates
Definition arachne.hpp:280
std::array< std::unordered_set< std::string >, batched_kind_count > extra_batches
Definition arachne.hpp:273
bool touch_entity(const std::string &id_with_prefix) noexcept
Increment the touch counter for a single full ID (prefix REQUIRED).
Definition arachne.cpp:224
static std::string entity_root(const std::string &id)
Extract the lexeme root from a full ID string.
Definition arachne.cpp:74
std::string current_group
Definition arachne.hpp:290
int touch_ids(std::span< const int > ids, corespace::entity_kind kind)
Batch variant of touch for numeric IDs.
Definition arachne.cpp:59
static bool parse_id(const std::string &entity, size_t &pos, int &id)
Parse a full ID string and extract the numeric portion.
Definition arachne.cpp:149
bool new_group(std::string name="")
Create or select a group and make it current.
Definition arachne.cpp:31
size_t add_entity(const std::string &id_with_prefix, bool force=false, std::string name="")
Enqueue a full (prefixed) ID string and add it to a group.
Definition arachne.cpp:235
std::unordered_map< std::string, std::unordered_set< std::string > > groups
Definition arachne.hpp:277
std::chrono::milliseconds staleness_threshold
Definition arachne.hpp:291
bool enqueue(std::string_view id, corespace::entity_kind kind, bool interactive) const
Decide whether an entity should be enqueued for fetching.
Definition arachne.cpp:201
const size_t batch_threshold
Typical unauthenticated entity-per-request cap.
Definition arachne.hpp:284
pheidippides phe_client
Definition arachne.hpp:293
const int candidates_threshold
Intentional high bar for curiosity-driven candidates.
Definition arachne.hpp:286
static std::string normalize(int id, corespace::entity_kind kind)
Normalize a numeric ID with the given kind to a prefixed string.
Definition arachne.cpp:165
static bool ask_update(std::string_view id, corespace::entity_kind kind, std::chrono::milliseconds age)
Placeholder for interactive staleness confirmation.
Definition arachne.cpp:194
void select_group(std::string name)
Select an existing group or create it on demand.
Definition arachne.cpp:184
std::array< std::unordered_set< std::string >, batched_kind_count > main_batches
Definition arachne.hpp:271
int queue_size(corespace::entity_kind kind) const noexcept
Get the number of queued (pending) entities tracked in the main batch containers.
Definition arachne.cpp:107
corespace::interface ui
Definition arachne.hpp:292
static corespace::entity_kind identify(const std::string &entity) noexcept
Determine the kind of a full ID string.
Definition arachne.cpp:122
bool flush(corespace::entity_kind kind=corespace::entity_kind::any)
Flush (send) up to batch_threshold entities of a specific kind.
Definition arachne.cpp:99
size_t add_ids(std::span< const int > ids, corespace::entity_kind kind, std::string name="")
Enqueue numeric IDs with a given kind and add them to a group.
Definition arachne.cpp:42
Batch courier for Wikidata/Commons: collects IDs, issues HTTP requests, and returns a merged JSON payload.
corespace::call_preview preview(const corespace::sparql_request &request) const
Produce a call preview describing the HTTP request that would be made.
corespace::http_client client
Reused HTTP client (not safe to share across threads).
nlohmann::json wdqs(std::string query)
Convenience wrapper to run a raw SPARQL query string.
corespace::call_preview build_call_preview(const corespace::sparql_request &request) const
corespace::wdqs_options wdqs_opt
nlohmann::json sparql(const corespace::sparql_request &request)
Execute a SPARQL query according to the provided request.
static std::string join_str(std::span< const std::string > ids, std::string_view separator="|")
Join a span of strings with a separator (no encoding or validation).
const corespace::network_metrics & metrics_info() const
Access aggregated network metrics of the underlying client.
nlohmann::json fetch_json(const std::unordered_set< std::string > &batch, corespace::entity_kind kind=corespace::entity_kind::any)
Fetch metadata for a set of entity IDs and return a merged JSON object.
corespace::options opt
Request shaping parameters (chunking, fields, base params).
static bool status_retry(const http_response &response, bool net_ok)
Retry predicate for transient outcomes.
std::unique_ptr< curl_slist, decltype(&curl_slist_free_all)> header_list
Owned request header list.
void update_headers(http_response &response) const
Refresh the header multimap from the last transfer.
http_response request_get(CURLU *url_handle, std::chrono::milliseconds &elapsed, std::string_view accept={}, int timeout_sec=-1) const
Execute a single HTTP GET using the prepared URL handle.
http_client()
Construct a client and initialize libcurl.
const network_metrics & metrics_info() const
Access aggregated network metrics.
network_metrics metrics
Aggregated metrics (atomic counters).
http_response post_raw(std::string_view url, std::string_view body, std::string_view content_type, const parameter_list &query={}, std::string_view accept={}, int timeout_sec=-1)
Perform an HTTP POST with a raw body.
long long next_delay(int attempt) const
Compute the next backoff delay for attempt (1-based).
const network_options opt
Fixed options installed at construction.
static curl_url_ptr build_url(std::string_view url, const parameter_list &params)
Construct a CURLU handle from url and append params.
http_response get(std::string_view url, const parameter_list &params={}, std::string_view accept={}, int timeout_sec=-1)
Perform an HTTP GET to url with optional query params.
http_response request_post(CURLU *url_handle, std::chrono::milliseconds &elapsed, std::string_view content_type, std::string_view body, std::string_view accept={}, int timeout_sec=-1) const
Execute a single HTTP POST with given body and content type.
static bool status_good(const http_response &response)
Success predicate: transport OK and HTTP 2xx.
http_response post_form(std::string_view url, const parameter_list &form, const parameter_list &query={}, std::string_view accept={}, int timeout_sec=-1)
Perform an HTTP POST with form-encoded body.
void apply_server_retry_hint(long long &sleep_ms) const
Apply server-provided retry hint if present.
std::unique_ptr< CURLU, decltype(&curl_url_cleanup)> curl_url_ptr
Unique pointer type for CURLU with proper deleter.
std::string build_form_body(const parameter_list &form) const
void update_metrics(const http_response &response, std::chrono::milliseconds elapsed)
Update counters and histograms after an attempt.
std::unique_ptr< CURL, decltype(&curl_easy_cleanup)> curl
Reused easy handle (not thread-safe).
static size_t write_callback(const char *ptr, size_t size, size_t n, void *data)
libcurl write callback: append chunk to response body.
static constexpr std::string prefixes
Definition arachne.cpp:29
constexpr std::size_t batched_kind_count
Number of batchable kinds (Q, P, L, M, E, form, sense).
Definition arachne.hpp:33
std::string resolve_accept(const sparql_request &request, const service_profile &profile, const std::string_view override_accept)
Resolves the Accept header value for a SPARQL request.
Definition utils.cpp:55
http_method choose_http_method(const sparql_request &request, const std::size_t threshold)
Chooses the appropriate HTTP method for a SPARQL request.
Definition utils.cpp:42
std::pair< std::string, bool > resolve_body_strategy(const sparql_request &request)
Determines the body content and strategy for a SPARQL request.
Definition utils.cpp:69
http_method
HTTP method to use for a request.
Definition utils.hpp:201
void sort_parameters(parameter_list &params)
Sorts the parameter list in-place by key.
Definition utils.cpp:96
entity_kind
Wikidata entity kind.
Definition utils.hpp:47
@ any
API selector (e.g., flush(any)); not directly batchable.
Definition utils.hpp:55
@ property
IDs prefixed with 'P'.
Definition utils.hpp:49
@ lexeme
IDs prefixed with 'L'.
Definition utils.hpp:50
@ form
Lexeme form IDs such as "L<lexeme>-F<form>".
Definition utils.hpp:53
@ item
IDs prefixed with 'Q'.
Definition utils.hpp:48
@ mediainfo
IDs prefixed with 'M'.
Definition utils.hpp:51
@ entity_schema
IDs prefixed with 'E'.
Definition utils.hpp:52
@ unknown
Unrecognized/invalid identifier.
Definition utils.hpp:56
@ sense
Lexeme sense IDs such as "L<lexeme>-S<sense>".
Definition utils.hpp:54
service_kind
Identifies supported SPARQL services.
Definition utils.hpp:74
const service_profile & get_service_profile(const service_kind kind)
Retrieve the service profile for a given service kind.
Definition utils.cpp:87
void append_common_params(const service_kind kind, const http_method method, parameter_list &params)
Appends common parameters required for a service and HTTP method.
Definition utils.cpp:105
std::pair< std::string, std::string > parameter
Single query parameter: key=value (pre-encoding is handled by libcurl).
Definition utils.hpp:61
std::string random_hex(const std::size_t n)
Return exactly n random hexadecimal characters (lowercase).
Definition rng.cpp:33
http_method_hint
Hint for selecting the HTTP method for a request.
Definition utils.hpp:215
parameter_list form_params
Definition utils.hpp:264
int timeout_sec
Per-request timeout in seconds (-1 for default).
Definition utils.hpp:272
std::string url
Full request URL (excluding query parameters).
Definition utils.hpp:259
bool has_param(std::string_view key) const
Check whether a query parameter with key key exists.
Definition utils.cpp:124
parameter_list query_params
Definition utils.hpp:261
std::string get_param(std::string_view key) const
Retrieve the first value for query parameter key.
Definition utils.cpp:130
http_method method
HTTP method to use for the request (GET or POST).
Definition utils.hpp:256
std::string accept
Accept header value indicating expected response format.
Definition utils.hpp:271
std::string content_type
Content-Type header value for the request body.
Definition utils.hpp:269
Result object for an HTTP transfer.
Definition utils.hpp:157
std::string error_message
Non-empty on libcurl error.
Definition utils.hpp:165
std::string text
Response body accumulated across callbacks.
Definition utils.hpp:163
header_map header
Response headers from the final attempt.
Definition utils.hpp:162
size_t status_code
HTTP status code (e.g., 200, 404).
Definition utils.hpp:161
CURLcode error_code
libcurl transport/result code.
Definition utils.hpp:164
std::atomic< long long > network_ms
Total time spent in libcurl (ms).
Definition utils.hpp:127
network_metrics()
Zero-initialize per-status counters.
Definition utils.cpp:81
std::atomic< unsigned > requests
Finished attempts (success or failure).
Definition utils.hpp:120
std::atomic< long long > sleep_ms
Total backoff duration slept (ms).
Definition utils.hpp:124
std::atomic< unsigned > retries
Number of retry cycles triggered.
Definition utils.hpp:123
std::atomic< size_t > bytes_received
Sum of response body sizes (bytes).
Definition utils.hpp:130
std::array< std::atomic< unsigned >, 600 > statuses
Per-code histogram for HTTP 0..599.
Definition utils.hpp:134
Fixed runtime options for the HTTP client.
Definition utils.hpp:183
int retry_base_ms
Base for exponential backoff (ms).
Definition utils.hpp:187
int connect_ms
Connect timeout (ms).
Definition utils.hpp:185
int timeout_ms
Total request timeout (ms).
Definition utils.hpp:184
std::string accept
Default Accept header.
Definition utils.hpp:190
std::string user_agent
Default User-Agent.
Definition utils.hpp:191
int max_retries
Max retry attempts after the first try.
Definition utils.hpp:186
long long retry_max_ms
Max per-attempt backoff (ms).
Definition utils.hpp:188
Configuration for fetching entities via MediaWiki/Wikibase API.
Definition utils.hpp:87
parameter_list params
Definition utils.hpp:95
std::vector< std::string > props
Definition utils.hpp:92
std::vector< std::string > prop
Definition utils.hpp:90
size_t batch_threshold
Definition utils.hpp:88
Static configuration values describing a remote service.
Definition utils.hpp:236
std::vector< std::string > rate_hints
Definition utils.hpp:239
std::string default_accept
Definition utils.hpp:238
std::string content_type
Definition utils.hpp:225
static constexpr size_t service_default
Definition utils.hpp:221
http_method_hint method
Definition utils.hpp:219
Options specific to WDQS usage and heuristics.
Definition utils.hpp:249
std::string accept_override
Definition utils.hpp:252
std::size_t length_threshold
Definition utils.hpp:250