2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
25#ifndef ARACHNE_UTILS_HPP
26#define ARACHNE_UTILS_HPP
39
40
41
42
43
44
45
46
61using parameter = std::pair<std::string, std::string>;
66
67
68
69
70
71
72
73
77
78
79
80
81
82
83
84
85
86
92 = {
"aliases",
"claims",
"datatype",
"descriptions",
93 "info",
"labels",
"sitelinks/urls" };
96 {
"format",
"json" }, {
"formatversion",
"2" },
97 {
"rvslots",
"main" }, {
"rvprop",
"content" },
98 {
"normalize",
"1" } };
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119struct network_metrics
final {
133 std::array<std::atomic<
unsigned>, 600>
137
138
139
140
145
146
147
148
149
150
151
152
153
154
155
156
169
170
171
172
173
174
175
176
177
178
179
180
181
182
195
196
197
198
199
200
203
204
205
206
207
208
209
210
211
212
213
214
229
230
231
232
233
234
235
243
244
245
246
247
248
277
278
279
280
284
285
286
287
292
293
294
295
298
299
300
301
304
305
306
307
308
309
314
315
316
317
318
322
323
324
325
326
327
330 std::string_view override_accept
333
334
335
336
337
338
339
340std::pair<std::string,
bool>
Accumulates entity IDs into per-kind batches and organizes groups.
std::unordered_map< std::string, int > candidates
std::array< std::unordered_set< std::string >, batched_kind_count > extra_batches
bool touch_entity(const std::string &id_with_prefix) noexcept
Increment the touch counter for a single full ID (prefix REQUIRED).
static std::string entity_root(const std::string &id)
Extract the lexeme root from a full ID string.
std::string current_group
int touch_ids(std::span< const int > ids, corespace::entity_kind kind)
Batch variant of touch for numeric IDs.
static bool parse_id(const std::string &entity, size_t &pos, int &id)
Parse a full ID string and extract the numeric portion.
bool new_group(std::string name="")
Create or select a group and make it current.
size_t add_entity(const std::string &id_with_prefix, bool force=false, std::string name="")
Enqueue a full (prefixed) ID string and add it to a group.
std::unordered_map< std::string, std::unordered_set< std::string > > groups
std::chrono::milliseconds staleness_threshold
bool enqueue(std::string_view id, corespace::entity_kind kind, bool interactive) const
Decide whether an entity should be enqueued for fetching.
const size_t batch_threshold
Typical unauthenticated entity-per-request cap.
const int candidates_threshold
Intentional high bar for curiosity-driven candidates.
static std::string normalize(int id, corespace::entity_kind kind)
Normalize a numeric ID with the given kind to a prefixed string.
static bool ask_update(std::string_view id, corespace::entity_kind kind, std::chrono::milliseconds age)
Placeholder for interactive staleness confirmation.
void select_group(std::string name)
Select an existing group or create it on demand.
std::array< std::unordered_set< std::string >, batched_kind_count > main_batches
int queue_size(corespace::entity_kind kind) const noexcept
Get the number of queued (pending) entities tracked in the main batch containers.
static corespace::entity_kind identify(const std::string &entity) noexcept
Determine the kind of a full ID string.
bool flush(corespace::entity_kind kind=corespace::entity_kind::any)
Flush (send) up to batch_threshold entities of a specific kind.
size_t add_ids(std::span< const int > ids, corespace::entity_kind kind, std::string name="")
Enqueue numeric IDs with a given kind and add them to a group.
Batch courier for Wikidata/Commons: collects IDs, issues HTTP requests, and returns a merged JSON pay...
corespace::call_preview preview(const corespace::sparql_request &request) const
Produce a call preview describing the HTTP request that would be made.
corespace::http_client client
Reused HTTP client (not thread-safe across threads).
nlohmann::json wdqs(std::string query)
Convenience wrapper to run a raw SPARQL query string.
corespace::call_preview build_call_preview(const corespace::sparql_request &request) const
corespace::wdqs_options wdqs_opt
nlohmann::json sparql(const corespace::sparql_request &request)
Execute a SPARQL query according to the provided request.
static std::string join_str(std::span< const std::string > ids, std::string_view separator="|")
Join a span of strings with a separator (no encoding or validation).
const corespace::network_metrics & metrics_info() const
Access aggregated network metrics of the underlying client.
nlohmann::json fetch_json(const std::unordered_set< std::string > &batch, corespace::entity_kind kind=corespace::entity_kind::any)
Fetch metadata for a set of entity IDs and return a merged JSON object.
corespace::options opt
Request shaping parameters (chunking, fields, base params).
static bool status_retry(const http_response &response, bool net_ok)
Retry predicate for transient outcomes.
std::unique_ptr< curl_slist, decltype(&curl_slist_free_all)> header_list
Owned request header list.
void update_headers(http_response &response) const
Refresh the header multimap from the last transfer.
http_response request_get(CURLU *url_handle, std::chrono::milliseconds &elapsed, std::string_view accept={}, int timeout_sec=-1) const
Execute a single HTTP GET using the prepared URL handle.
http_client()
Construct a client and initialize libcurl.
const network_metrics & metrics_info() const
Access aggregated network metrics.
network_metrics metrics
Aggregated metrics (atomic counters).
http_response post_raw(std::string_view url, std::string_view body, std::string_view content_type, const parameter_list &query={}, std::string_view accept={}, int timeout_sec=-1)
Perform an HTTP POST with a raw body.
long long next_delay(int attempt) const
Compute the next backoff delay for attempt (1-based).
const network_options opt
Fixed options installed at construction.
static curl_url_ptr build_url(std::string_view url, const parameter_list &params)
Construct a CURLU handle from url and append params.
http_response get(std::string_view url, const parameter_list &params={}, std::string_view accept={}, int timeout_sec=-1)
Perform an HTTP GET to url with optional query params.
http_response request_post(CURLU *url_handle, std::chrono::milliseconds &elapsed, std::string_view content_type, std::string_view body, std::string_view accept={}, int timeout_sec=-1) const
Execute a single HTTP POST with given body and content type.
static bool status_good(const http_response &response)
Success predicate: transport OK and HTTP 2xx.
http_response post_form(std::string_view url, const parameter_list &form, const parameter_list &query={}, std::string_view accept={}, int timeout_sec=-1)
Perform an HTTP POST with form-encoded body.
void apply_server_retry_hint(long long &sleep_ms) const
Apply server-provided retry hint if present.
std::unique_ptr< CURLU, decltype(&curl_url_cleanup)> curl_url_ptr
Unique pointer type for CURLU with proper deleter.
std::string build_form_body(const parameter_list &form) const
void update_metrics(const http_response &response, std::chrono::milliseconds elapsed)
Update counters and histograms after an attempt.
std::unique_ptr< CURL, decltype(&curl_easy_cleanup)> curl
Reused easy handle (not thread-safe).
static size_t write_callback(const char *ptr, size_t size, size_t n, void *data)
libcurl write callback: append chunk to response body.
static constexpr std::string prefixes
constexpr std::size_t batched_kind_count
Number of batchable kinds (Q, P, L, M, E, form, sense).
std::string resolve_accept(const sparql_request &request, const service_profile &profile, const std::string_view override_accept)
Resolves the Accept header value for a SPARQL request.
http_method choose_http_method(const sparql_request &request, const std::size_t threshold)
Chooses the appropriate HTTP method for a SPARQL request.
std::pair< std::string, bool > resolve_body_strategy(const sparql_request &request)
Determines the body content and strategy for a SPARQL request.
http_method
HTTP method to use for a request.
void sort_parameters(parameter_list &params)
Sorts the parameter list in-place by key.
entity_kind
Wikidata entity kind.
@ any
API selector (e.g., flush(any)); not directly batchable.
@ property
IDs prefixed with 'P'.
@ lexeme
IDs prefixed with 'L'.
@ form
Lexeme form IDs such as "L<lexeme>-F<form>".
@ item
IDs prefixed with 'Q'.
@ mediainfo
IDs prefixed with 'M'.
@ entity_schema
IDs prefixed with 'E'.
@ unknown
Unrecognized/invalid identifier.
@ sense
Lexeme sense IDs such as "L<lexeme>-S<sense>".
service_kind
Identifies supported SPARQL services.
const service_profile & get_service_profile(const service_kind kind)
Retrieve the service profile for a given service kind.
void append_common_params(const service_kind kind, const http_method method, parameter_list &params)
Appends common parameters required for a service and HTTP method.
std::pair< std::string, std::string > parameter
Single query parameter: key=value (pre-encoding is handled by libcurl).
std::string random_hex(const std::size_t n)
Return exactly n random hexadecimal characters (lowercase).
http_method_hint
Hint for selecting the HTTP method for a request.
parameter_list form_params
int timeout_sec
Per-request timeout in seconds (-1 for default).
std::string url
Full request URL (excluding query parameters).
bool has_param(std::string_view key) const
Check whether a query parameter with the given key exists.
parameter_list query_params
std::string get_param(std::string_view key) const
Retrieve the first value for query parameter key.
http_method method
HTTP method to use for the request (GET, POST, etc.).
std::string accept
Accept header value indicating expected response format.
std::string content_type
Content-Type header value for the request body.
Result object for an HTTP transfer.
std::string error_message
Non-empty on libcurl error.
std::string text
Response body accumulated across callbacks.
header_map header
Response headers from the final attempt.
size_t status_code
HTTP status code (e.g., 200, 404).
CURLcode error_code
libcurl transport/result code.
std::atomic< long long > network_ms
Total time spent in libcurl (ms).
network_metrics()
Zero-initialize per-status counters.
std::atomic< unsigned > requests
Finished attempts (success or failure).
std::atomic< long long > sleep_ms
Total backoff duration slept (ms).
std::atomic< unsigned > retries
Number of retry cycles triggered.
std::atomic< size_t > bytes_received
Sum of response body sizes (bytes).
std::array< std::atomic< unsigned >, 600 > statuses
Per-code histogram for HTTP 0..599.
Fixed runtime options for the HTTP client.
int retry_base_ms
Base for exponential backoff (ms).
int connect_ms
Connect timeout (ms).
int timeout_ms
Total request timeout (ms).
std::string accept
Default Accept header.
std::string user_agent
Default User-Agent.
int max_retries
Max retry attempts after the first try.
long long retry_max_ms
Max per-attempt backoff (ms).
Configuration for fetching entities via MediaWiki/Wikibase API.
std::vector< std::string > props
std::vector< std::string > prop
Static configuration values describing a remote service.
std::vector< std::string > rate_hints
std::string default_accept
static constexpr size_t service_default
Options specific to WDQS usage and heuristics.
std::string accept_override
std::size_t length_threshold