2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
25#ifndef ARACHNE_UTILS_HPP
26#define ARACHNE_UTILS_HPP
38
39
40
41
42
43
44
45
60using parameter = std::pair<std::string, std::string>;
65
66
67
68
69
70
71
72
73
74
80 = {
"aliases",
"claims",
"datatype",
"descriptions",
81 "info",
"labels",
"sitelinks/urls" };
84 {
"format",
"json" }, {
"formatversion",
"2" },
85 {
"rvslots",
"main" }, {
"rvprop",
"content" },
86 {
"normalize",
"1" } };
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107struct network_metrics
final {
121 std::array<std::atomic<
unsigned>, 600>
125
126
127
128
133
134
135
136
137
138
139
140
141
142
143
144
157
158
159
160
161
162
163
164
165
166
167
168
169
170
Accumulates entity IDs into per-kind batches and organizes groups.
std::unordered_map< std::string, int > candidates
std::array< std::unordered_set< std::string >, batched_kind_count > extra_batches
bool touch_entity(const std::string &id_with_prefix) noexcept
Increment the touch counter for a single full ID (prefix REQUIRED).
static std::string entity_root(const std::string &id)
Extract the lexeme root from a full ID string.
std::string current_group
int touch_ids(std::span< const int > ids, corespace::entity_kind kind)
Batch variant of touch for numeric IDs.
static bool parse_id(const std::string &entity, size_t &pos, int &id)
Parse a full ID string and extract the numeric portion.
bool new_group(std::string name="")
Create or select a group and make it current.
size_t add_entity(const std::string &id_with_prefix, bool force=false, std::string name="")
Enqueue a full (prefixed) ID string and add it to a group.
std::unordered_map< std::string, std::unordered_set< std::string > > groups
std::chrono::milliseconds staleness_threshold
bool enqueue(std::string_view id, corespace::entity_kind kind, bool interactive) const
Decide whether an entity should be enqueued for fetching.
const size_t batch_threshold
Typical unauthenticated entity-per-request cap.
const int candidates_threshold
Intentional high bar for curiosity-driven candidates.
static std::string normalize(int id, corespace::entity_kind kind)
Normalize a numeric ID with the given kind to a prefixed string.
static bool ask_update(std::string_view id, corespace::entity_kind kind, std::chrono::milliseconds age)
Placeholder for interactive staleness confirmation.
void select_group(std::string name)
Select an existing group or create it on demand.
std::array< std::unordered_set< std::string >, batched_kind_count > main_batches
int queue_size(corespace::entity_kind kind) const noexcept
Get the number of queued (pending) entities tracked in the main batch containers.
static corespace::entity_kind identify(const std::string &entity) noexcept
Determine the kind of a full ID string.
bool flush(corespace::entity_kind kind=corespace::entity_kind::any)
Flush (send) up to batch_threshold entities of a specific kind.
size_t add_ids(std::span< const int > ids, corespace::entity_kind kind, std::string name="")
Enqueue numeric IDs with a given kind and add them to a group.
Batch courier for Wikidata/Commons: collects IDs, issues HTTP requests, and returns a merged JSON pay...
corespace::http_client client
Reused HTTP client (not thread-safe across threads).
static std::string join_str(std::span< const std::string > ids, std::string_view separator="|")
Join a span of strings with a separator (no encoding or validation).
const corespace::network_metrics & metrics_info() const
Access aggregated network metrics of the underlying client.
nlohmann::json fetch_json(const std::unordered_set< std::string > &batch, corespace::entity_kind kind=corespace::entity_kind::any)
Fetch metadata for a set of entity IDs and return a merged JSON object.
corespace::options opt
Request shaping parameters (chunking, fields, base params).
static bool status_retry(const http_response &response, bool net_ok)
Retry predicate for transient outcomes.
http_response post_form(std::string_view url, const parameter_list &form, const parameter_list &query={}, std::string_view override={})
std::unique_ptr< curl_slist, decltype(&curl_slist_free_all)> header_list
Owned request header list.
http_response request_post(CURLU *url_handle, std::chrono::milliseconds &elapsed, std::string_view content_type, std::string_view body, std::string_view override) const
void update_headers(http_response &response) const
Refresh the header multimap from the last transfer.
http_client()
Construct a client and initialize libcurl.
const network_metrics & metrics_info() const
Access aggregated network metrics.
network_metrics metrics
Aggregated metrics (atomic counters).
long long next_delay(int attempt) const
Compute the next backoff delay for attempt (1-based).
const network_options opt
Fixed options installed at construction.
http_response request_get(CURLU *url_handle, std::chrono::milliseconds &elapsed, std::string_view override={}) const
Execute a single HTTP GET using the prepared URL handle.
static curl_url_ptr build_url(std::string_view url, const parameter_list &params)
Construct a CURLU handle from url and append params.
static bool status_good(const http_response &response)
Success predicate: transport OK and HTTP 2xx.
http_response post_raw(std::string_view url, std::string_view body, std::string_view content_type, const parameter_list &query={}, std::string_view override={})
void apply_server_retry_hint(long long &sleep_ms) const
Apply server-provided retry hint if present.
std::unique_ptr< CURLU, decltype(&curl_url_cleanup)> curl_url_ptr
Unique pointer type for CURLU with proper deleter.
std::string build_form_body(const parameter_list &form) const
void update_metrics(const http_response &response, std::chrono::milliseconds elapsed)
Update counters and histograms after an attempt.
std::unique_ptr< CURL, decltype(&curl_easy_cleanup)> curl
Reused easy handle (not thread-safe).
static size_t write_callback(const char *ptr, size_t size, size_t n, void *data)
libcurl write callback: append chunk to response body.
http_response get(std::string_view url, const parameter_list &params={}, std::string_view override={})
Perform an HTTP GET to url with optional query params.
static constexpr std::string prefixes
constexpr std::size_t batched_kind_count
Number of batchable kinds (Q, P, L, M, E, form, sense).
entity_kind
Wikidata entity kind.
@ any
API selector (e.g., flush(any)); not directly batchable.
@ property
IDs prefixed with 'P'.
@ lexeme
IDs prefixed with 'L'.
@ form
Lexeme form IDs such as "L<lexeme>-F<form>".
@ item
IDs prefixed with 'Q'.
@ mediainfo
IDs prefixed with 'M'.
@ entity_schema
IDs prefixed with 'E'.
@ unknown
Unrecognized/invalid identifier.
@ sense
Lexeme sense IDs such as "L<lexeme>-S<sense>".
std::pair< std::string, std::string > parameter
Single query parameter: key=value (pre-encoding is handled by libcurl).
std::string random_hex(const std::size_t n)
Return exactly n random hexadecimal characters (lowercase).
Result object for an HTTP transfer.
std::string error_message
Non-empty on libcurl error.
std::string text
Response body accumulated across callbacks.
header_map header
Response headers from the final attempt.
size_t status_code
HTTP status code (e.g., 200, 404).
CURLcode error_code
libcurl transport/result code.
std::atomic< long long > network_ms
Total time spent in libcurl (ms).
network_metrics()
Zero-initialize per-status counters.
std::atomic< unsigned > requests
Finished attempts (success or failure).
std::atomic< long long > sleep_ms
Total backoff duration slept (ms).
std::atomic< unsigned > retries
Number of retry cycles triggered.
std::atomic< size_t > bytes_received
Sum of response body sizes (bytes).
std::array< std::atomic< unsigned >, 600 > statuses
Per-code histogram for HTTP 0..599.
Fixed runtime options for the HTTP client.
int retry_base_ms
Base for exponential backoff (ms).
int connect_ms
Connect timeout (ms).
int timeout_ms
Total request timeout (ms).
std::string accept
Default Accept header.
std::string user_agent
Default User-Agent.
int max_retries
Max retry attempts after the first try.
long long retry_max_ms
Max per-attempt backoff (ms).
Configuration for fetching entities via MediaWiki/Wikibase API.
std::vector< std::string > props
std::size_t batch_threshold
std::vector< std::string > prop