Arachne 1.0
Arachne - the perpetual stitcher of Wikidata entities.
Loading...
Searching...
No Matches
arachne.cpp
Go to the documentation of this file.
1/*
2 * The MIT License (MIT)
3 *
4 * Copyright (c) 2025 Yaroslav Riabtsev <yaroslav.riabtsev@rwth-aachen.de>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "arachne.hpp"
26#include "rng.hpp"
27
28namespace arachnespace {
// One-letter ID prefixes. A letter's position in this table is the numeric
// value that identify() casts to corespace::entity_kind and that normalize()
// uses to pick the prefix for a kind, so the order must not change.
// Held as a string_view: a namespace-scope `constexpr std::string` is only
// accepted by toolchains whose small-string optimisation keeps it
// allocation-free during constant evaluation.
static constexpr std::string_view prefixes = "QPLME";
30
31bool arachne::new_group(std::string name) {
32 if (name.empty()) {
33 do {
34 name = "g_" + corespace::random_hex(8);
35 } while (groups.contains(name));
36 }
37 auto [it, inserted] = groups.try_emplace(name);
38 current_group = it->first;
39 return inserted;
40}
41
// arachne::add_ids (the symbol index places its definition at listing line
// 42): enqueue numeric IDs of one kind and add them to a group.
// NOTE(review): listing line 42, which opens the signature, is missing from
// this extract; only the parameter list below survived.
43 const std::span<const int> ids, const corespace::entity_kind kind,
44 std::string name
45) {
 // NOTE(review): listing lines 46-47 (the condition that guards this throw)
 // are missing; presumably they reject non-concrete kinds - confirm.
48 throw std::invalid_argument("unknown kind of numeric IDs");
49 }
 // Make the requested group current before any entity is added.
50 select_group(std::move(name));
 // Seed the result so an empty `ids` reports the group's existing size.
51 size_t last_size = groups[current_group].size();
52 for (const int id : ids) {
 // Turn the bare number into a prefixed ID string for add_entity().
53 std::string id_with_prefix = normalize(id, kind);
 // force = false; the group is passed by name on every call.
54 last_size = add_entity(id_with_prefix, false, current_group);
55 }
 // The value returned by the last add_entity() call (or the seeded size).
56 return last_size;
57}
58
// arachne::touch_ids (the symbol index places its definition at listing line
// 59): batch variant of touch_entity() for numeric IDs.
// NOTE(review): listing line 59, which opens the signature, is missing from
// this extract; only the parameter list below survived.
60 const std::span<const int> ids, const corespace::entity_kind kind
61) {
 // NOTE(review): listing lines 62-63 (the condition that guards this throw)
 // are missing; presumably they reject non-concrete kinds - confirm.
64 throw std::invalid_argument("unknown kind of numeric IDs");
65 }
66 int added = 0;
67 for (const int id : ids) {
68 std::string id_with_prefix = normalize(id, kind);
 // touch_entity() returns bool, so each `true` contributes 1 to the tally.
69 added += touch_entity(id_with_prefix);
70 }
 // Number of IDs for which touch_entity() returned true.
71 return added;
72}
73
// Extract the lexeme root from a full ID string: the visible branch parses
// the number that follows a leading 'L' and returns "L<number>"; IDs that do
// not enter that branch are returned unchanged.
// Throws std::invalid_argument for a rejected kind, a bad 'L' prefix, or an
// unparsable number.
74std::string arachne::entity_root(const std::string& id) {
75 const corespace::entity_kind kind = identify(id);
 // NOTE(review): listing lines 76-77 (the condition that guards this throw)
 // are missing from this extract.
78 throw std::invalid_argument("invalid or unknown entity kind");
79 }
80
 // NOTE(review): listing lines 81-82 are missing; they open the branch
 // closed at line 95, presumably selecting form/sense IDs - confirm.
83 if (id.size() < 2 || id.front() != 'L') {
84 throw std::invalid_argument(
85 "bad root-lexeme prefix of the entity: " + id
86 );
87 }
88 int val {};
 // Parsing starts at index 1, i.e. right after the 'L'.
89 if (size_t pos = 1; !parse_id(id, pos, val)) {
90 throw std::invalid_argument(
91 "bad numeric identifier of the entity: " + id
92 );
93 }
 // Rebuild the ID from the parsed number; whatever followed it is dropped.
94 return "L" + std::to_string(val);
95 }
96 return id;
97}
98
// arachne::flush (the symbol index places its definition at listing line 99):
// send the batch of one entity kind through phe_client.
// NOTE(review): listing line 99 (the signature) is missing from this extract.
// NOTE(review): the index shows `kind` defaulting to entity_kind::any, and it
// is used here as a direct index into main_batches - confirm that is in range.
100 const auto& batch = main_batches[static_cast<size_t>(kind)];
 // Remember the size before fetching so a change can be detected afterwards.
101 const size_t size = batch.size();
 // `data` is not used yet; storing it is still commented out below.
102 auto data = phe_client.fetch_json(batch, kind);
103 // ariadne.store(data);
 // True only if the batch holds fewer entries now than before the fetch.
104 return size > batch.size();
105}
106
107int arachne::queue_size(const corespace::entity_kind kind) const noexcept {
108 if (kind == corespace::entity_kind::any) {
109 std::size_t sum = 0;
110 for (const auto& batch : main_batches) {
111 sum += batch.size();
112 }
113 return static_cast<int>(sum);
114 }
115 const auto idx = static_cast<std::size_t>(kind);
116 if (idx >= main_batches.size()) {
117 return 0;
118 }
119 return static_cast<int>(main_batches[idx].size());
120}
121
// Determine the kind of a full ID string.
// Two shapes reach a visible return: "<prefix><number>", and a lexeme ID
// followed by "-F<number>" or "-S<number>".
// NOTE(review): the bodies of every rejecting branch (listing lines 124, 130,
// 138, 143) are missing from this extract; presumably each returns
// corespace::entity_kind::unknown - confirm.
122corespace::entity_kind arachne::identify(const std::string& entity) noexcept {
 // Fewer than two characters cannot hold both a prefix and a number.
123 if (entity.size() < 2) {
125 }
126 size_t pos = 0;
 // The prefix letter's position in `prefixes` is later cast to the kind.
127 size_t kind = prefixes.find(entity[pos++]);
128 int id {};
 // Unknown prefix letter, or no valid number right after it.
129 if (kind == std::string::npos || !parse_id(entity, pos, id)) {
131 }
 // The whole string was consumed: a plain "<prefix><number>" ID.
132 if (pos == entity.size()) {
133 return static_cast<corespace::entity_kind>(kind);
134 }
 // Trailing text is accepted only on a lexeme, and must start with '-'
 // followed by at least one more character.
135 if (kind != static_cast<size_t>(corespace::entity_kind::lexeme)
136 || pos >= entity.size() || entity[pos++] != '-'
137 || pos >= entity.size()) {
139 }
140 const char tag = entity[pos++];
 // The tag must be 'F' or 'S', followed by a number that ends the string.
141 if (tag != 'F' && tag != 'S' || !parse_id(entity, pos, id)
142 || pos != entity.size()) {
144 }
 // NOTE(review): the second arm of this conditional (listing line 146) is
 // missing; presumably it yields corespace::entity_kind::sense - confirm.
145 return tag == 'F' ? corespace::entity_kind::form
147}
148
149bool arachne::parse_id(const std::string& entity, size_t& pos, int& id) {
150 id = 0;
151 size_t len = 0;
152 try {
153 id = std::stoi(entity.substr(pos), &len);
154 } catch (...) {
155 return false;
156 }
157 if (id < 0 || len == 0 || std::to_string(id).size() != len) {
158 return false;
159 }
160 pos += len;
161 return true;
162}
163
// Normalize a numeric ID with the given kind to a prefixed string: the
// prefix letter chosen for the kind followed by the decimal number.
// Throws std::invalid_argument for a negative id or a rejected kind.
164std::string
165arachne::normalize(const int id, const corespace::entity_kind kind) {
166 if (id < 0) {
167 throw std::invalid_argument("normalize: id must be non-negative");
168 }
 // NOTE(review): listing lines 169-170 (the condition that guards this throw)
 // are missing from this extract.
171 throw std::invalid_argument(
172 "normalize: kind must be a concrete, known entity kind"
173 );
174 }
 // The kind's numeric value is used as an index into `prefixes`.
175 auto idx = static_cast<std::size_t>(kind);
176 if (idx >= static_cast<size_t>(corespace::entity_kind::form)) {
177 // Numeric Form/Sense are not representable; map to lexeme.
178 // TODO: emit warning via logging sink.
179 idx = static_cast<size_t>(corespace::entity_kind::lexeme);
180 }
181 return prefixes[idx] + std::to_string(id);
182}
183
// Select an existing group or create it on demand.
// An empty name leaves an already-selected group in place; a non-empty name
// is handed to new_group(), which makes that group current.
184void arachne::select_group(std::string name) {
185 if (name.empty()) {
 // NOTE(review): the body of this branch (listing line 187) is missing;
 // presumably it creates an anonymous group when none is current - confirm.
186 if (current_group.empty()) {
188 }
189 return;
190 }
191 new_group(std::move(name));
192}
193
194bool arachne::ask_update(
195 std::string_view, corespace::entity_kind, const std::chrono::milliseconds
196) {
197 // UI/UX: todo: ask user if update is needed
198 return false;
199}
200
// arachne::enqueue (the symbol index places its definition at listing line
// 201): decide whether an entity should be enqueued for fetching.
// NOTE(review): listing line 201, which opens the signature, is missing from
// this extract; only the parameter list below survived.
202 const std::string_view id, const corespace::entity_kind kind,
203 const bool interactive
204) const {
205 // ariadne.entity_status(id)
 // The status lookup is still stubbed: `exist` is hard-wired to false and
 // `last` to -1, so the next check currently always returns true.
206 auto [exist, last] = std::pair<bool, long long>(false, -1);
 // No stored copy, or no usable timestamp: enqueue.
207 if (!exist || last < 0) {
208 return true;
209 }
 // Current time since the system_clock epoch, in milliseconds.
210 const auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
211 std::chrono::system_clock::now().time_since_epoch()
212 )
213 .count();
 // NOTE(review): assumes `last` is also epoch milliseconds - confirm.
214 const auto age = std::chrono::milliseconds { now_ms - last };
 // Older than the staleness threshold: enqueue.
215 if (age > staleness_threshold) {
216 return true;
217 }
 // Not stale: in interactive mode the decision is delegated to ask_update().
218 if (interactive) {
219 return ask_update(id, kind, age);
220 }
221 return false;
222}
223
224bool arachne::touch_entity(const std::string& id_with_prefix) noexcept {
225 candidates[id_with_prefix]++;
226 if (candidates[id_with_prefix] >= candidates_threshold) {
227 const std::string canonical = entity_root(id_with_prefix);
228 corespace::entity_kind kind = identify(canonical);
229 extra_batches[static_cast<size_t>(kind)].insert(canonical);
230 return true;
231 }
232 return false;
233}
234
// arachne::add_entity (the symbol index places its definition at listing line
// 235): enqueue a full (prefixed) ID string and add it to a group.
// NOTE(review): listing line 235, which opens the signature, is missing from
// this extract; only the parameter list below survived.
236 const std::string& id_with_prefix, const bool force, std::string name
237) {
 // Runs before any state is touched; entity_root() throws on IDs it rejects.
238 const std::string canonical = entity_root(id_with_prefix);
239 select_group(std::move(name));
240 auto& group = groups[current_group];
 // The group stores the ID as given; the batch below stores the root form.
241 group.insert(id_with_prefix);
 // `force` skips the enqueue() check; enqueue() is interactive only when the
 // interface is the command line.
242 if (corespace::entity_kind kind = identify(canonical); force
243 || enqueue(canonical, kind, ui == corespace::interface::command_line)) {
244 auto& pool = main_batches[static_cast<size_t>(kind)];
245 pool.insert(canonical);
 // Flush this kind's batch as soon as it reaches the batch threshold.
246 if (pool.size() >= batch_threshold) {
247 flush(kind);
248 }
249 }
 // Size of the current group after the insertion.
250 return group.size();
251}
252}
Accumulates entity IDs into per-kind batches and organizes groups.
Definition arachne.hpp:47
bool touch_entity(const std::string &id_with_prefix) noexcept
Increment the touch counter for a single full ID (prefix REQUIRED).
Definition arachne.cpp:224
static std::string entity_root(const std::string &id)
Extract the lexeme root from a full ID string.
Definition arachne.cpp:74
std::string current_group
Definition arachne.hpp:290
int touch_ids(std::span< const int > ids, corespace::entity_kind kind)
Batch variant of touch for numeric IDs.
Definition arachne.cpp:59
bool new_group(std::string name="")
Create or select a group and make it current.
Definition arachne.cpp:31
size_t add_entity(const std::string &id_with_prefix, bool force=false, std::string name="")
Enqueue a full (prefixed) ID string and add it to a group.
Definition arachne.cpp:235
bool enqueue(std::string_view id, corespace::entity_kind kind, bool interactive) const
Decide whether an entity should be enqueued for fetching.
Definition arachne.cpp:201
static std::string normalize(int id, corespace::entity_kind kind)
Normalize a numeric ID with the given kind to a prefixed string.
Definition arachne.cpp:165
void select_group(std::string name)
Select an existing group or create it on demand.
Definition arachne.cpp:184
int queue_size(corespace::entity_kind kind) const noexcept
Get the number of queued (pending) entities tracked in the main batch containers.
Definition arachne.cpp:107
static corespace::entity_kind identify(const std::string &entity) noexcept
Determine the kind of a full ID string.
Definition arachne.cpp:122
bool flush(corespace::entity_kind kind=corespace::entity_kind::any)
Flush (send) up to batch_threshold entities of a specific kind.
Definition arachne.cpp:99
size_t add_ids(std::span< const int > ids, corespace::entity_kind kind, std::string name="")
Enqueue numeric IDs with a given kind and add them to a group.
Definition arachne.cpp:42
static constexpr std::string prefixes
Definition arachne.cpp:29
entity_kind
Wikidata entity kind.
Definition utils.hpp:46
@ any
API selector (e.g., flush(any)); not directly batchable.
Definition utils.hpp:54
@ lexeme
IDs prefixed with 'L'.
Definition utils.hpp:49
@ form
Lexeme form IDs such as "L<lexeme>-F<form>".
Definition utils.hpp:52
@ unknown
Unrecognized/invalid identifier.
Definition utils.hpp:55
@ sense
Lexeme sense IDs such as "L<lexeme>-S<sense>".
Definition utils.hpp:53
std::string random_hex(const std::size_t n)
Return exactly n random hexadecimal characters (lowercase).
Definition rng.cpp:33