Ada 3.4.3
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
helpers.cpp
Go to the documentation of this file.
1#include <cstdint>
2#include <cstring>
3#include <sstream>
4
5#include "ada/checkers-inl.h"
6#include "ada/common_defs.h"
7#include "ada/helpers.h"
8#include "ada/scheme.h"
9#include "ada/state.h"
10
11#if ADA_SSSE3
12#include <tmmintrin.h>
13#endif
14
15namespace ada::helpers {
16
17template <typename out_iter>
18void encode_json(std::string_view view, out_iter out) {
19 // trivial implementation. could be faster.
20 const char* hexvalues =
21 "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f";
22 for (uint8_t c : view) {
23 if (c == '\\') {
24 *out++ = '\\';
25 *out++ = '\\';
26 } else if (c == '"') {
27 *out++ = '\\';
28 *out++ = '"';
29 } else if (c <= 0x1f) {
30 *out++ = '\\';
31 *out++ = 'u';
32 *out++ = '0';
33 *out++ = '0';
34 *out++ = hexvalues[2 * c];
35 *out++ = hexvalues[2 * c + 1];
36 } else {
37 *out++ = c;
38 }
39 }
40}
41
43 switch (s) {
45 return "Authority";
47 return "Scheme Start";
49 return "Scheme";
51 return "Host";
53 return "No Scheme";
55 return "Fragment";
57 return "Relative Scheme";
59 return "Relative Slash";
61 return "File";
63 return "File Host";
65 return "File Slash";
67 return "Path or Authority";
69 return "Special Authority Ignore Slashes";
71 return "Special Authority Slashes";
73 return "Special Relative or Authority";
75 return "Query";
77 return "Path";
79 return "Path Start";
81 return "Opaque Path";
83 return "Port";
84 default:
85 return "unknown state";
86 }
87}
88
89ada_really_inline std::optional<std::string_view> prune_hash(
90 std::string_view& input) noexcept {
91 // compiles down to 20--30 instructions including a class to memchr (C
92 // function). this function should be quite fast.
93 size_t location_of_first = input.find('#');
94 if (location_of_first == std::string_view::npos) {
95 return std::nullopt;
96 }
97 std::string_view hash = input;
98 hash.remove_prefix(location_of_first + 1);
99 input.remove_suffix(input.size() - location_of_first);
100 return hash;
101}
102
103ada_really_inline bool shorten_path(std::string& path, ada::scheme::type type) {
104 // Let path be url's path.
105 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
106 // Windows drive letter, then return.
107 if (type == ada::scheme::type::FILE &&
108 path.find('/', 1) == std::string_view::npos && !path.empty()) {
110 helpers::substring(path, 1))) {
111 return false;
112 }
113 }
114
115 // Remove path's last item, if any.
116 size_t last_delimiter = path.rfind('/');
117 if (last_delimiter != std::string::npos) {
118 path.erase(last_delimiter);
119 return true;
120 }
121
122 return false;
123}
124
125ada_really_inline bool shorten_path(std::string_view& path,
126 ada::scheme::type type) {
127 // Let path be url's path.
128 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
129 // Windows drive letter, then return.
130 if (type == ada::scheme::type::FILE &&
131 path.find('/', 1) == std::string_view::npos && !path.empty()) {
133 helpers::substring(path, 1))) {
134 return false;
135 }
136 }
137
138 // Remove path's last item, if any.
139 if (!path.empty()) {
140 size_t slash_loc = path.rfind('/');
141 if (slash_loc != std::string_view::npos) {
142 path.remove_suffix(path.size() - slash_loc);
143 return true;
144 }
145 }
146
147 return false;
148}
149
150ada_really_inline void remove_ascii_tab_or_newline(std::string& input) {
151 // if this ever becomes a performance issue, we could use an approach similar
152 // to has_tabs_or_newline
153 std::erase_if(input, ada::unicode::is_ascii_tab_or_newline);
154}
155
156ada_really_inline constexpr std::string_view substring(std::string_view input,
157 size_t pos) {
158 ADA_ASSERT_TRUE(pos <= input.size());
159 // The following is safer but unneeded if we have the above line:
160 // return pos > input.size() ? std::string_view() : input.substr(pos);
161 return input.substr(pos);
162}
163
164ada_really_inline void resize(std::string_view& input, size_t pos) noexcept {
165 ADA_ASSERT_TRUE(pos <= input.size());
166 input.remove_suffix(input.size() - pos);
167}
168
169// computes the number of trailing zeroes
170// this is a private inline function only defined in this source file.
171ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
172#ifdef ADA_REGULAR_VISUAL_STUDIO
173 unsigned long ret;
174 // Search the mask data from least significant bit (LSB)
175 // to the most significant bit (MSB) for a set bit (1).
176 _BitScanForward(&ret, input_num);
177 return (int)ret;
178#else // ADA_REGULAR_VISUAL_STUDIO
179 return __builtin_ctzl(input_num);
180#endif // ADA_REGULAR_VISUAL_STUDIO
181}
182
183// starting at index location, this finds the next location of a character
184// :, /, \\, ? or [. If none is found, view.size() is returned.
185// For use within get_host_delimiter_location.
186#if ADA_SSSE3
188 std::string_view view, size_t location) noexcept {
189 // first check for short strings in which case we do it naively.
190 if (view.size() - location < 16) { // slow path
191 for (size_t i = location; i < view.size(); i++) {
192 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
193 view[i] == '?' || view[i] == '[') {
194 return i;
195 }
196 }
197 return size_t(view.size());
198 }
199 // fast path for long strings (expected to be common)
200 // Using SSSE3's _mm_shuffle_epi8 for table lookup (same approach as NEON)
201 size_t i = location;
202 const __m128i low_mask =
203 _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
204 0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
205 const __m128i high_mask =
206 _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
207 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
208 const __m128i fmask = _mm_set1_epi8(0xf);
209 const __m128i zero = _mm_setzero_si128();
210 for (; i + 15 < view.size(); i += 16) {
211 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
212 __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
213 __m128i highpart = _mm_shuffle_epi8(
214 high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
215 __m128i classify = _mm_and_si128(lowpart, highpart);
216 __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
217 // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
218 // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
219 // avoid false positives.
220 int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
221 if (mask != 0) {
222 return i + trailing_zeroes(static_cast<uint32_t>(mask));
223 }
224 }
225 if (i < view.size()) {
226 __m128i word =
227 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
228 __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
229 __m128i highpart = _mm_shuffle_epi8(
230 high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
231 __m128i classify = _mm_and_si128(lowpart, highpart);
232 __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
233 // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
234 // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
235 // avoid false positives.
236 int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
237 if (mask != 0) {
238 return view.length() - 16 + trailing_zeroes(static_cast<uint32_t>(mask));
239 }
240 }
241 return size_t(view.size());
242}
243#elif ADA_NEON
244// The ada_make_uint8x16_t macro is necessary because Visual Studio does not
245// support direct initialization of uint8x16_t. See
246// https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
247#ifndef ada_make_uint8x16_t
248#define ada_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
249 x13, x14, x15, x16) \
250 ([=]() { \
251 static uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
252 x9, x10, x11, x12, x13, x14, x15, x16}; \
253 return vld1q_u8(array); \
254 }())
255#endif
256
258 std::string_view view, size_t location) noexcept {
259 // first check for short strings in which case we do it naively.
260 if (view.size() - location < 16) { // slow path
261 for (size_t i = location; i < view.size(); i++) {
262 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
263 view[i] == '?' || view[i] == '[') {
264 return i;
265 }
266 }
267 return size_t(view.size());
268 }
269 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
270 uint8x16_t bit_mask =
271 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
272 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
273 uint8x16_t minput = vandq_u8(input, bit_mask);
274 uint8x16_t tmp = vpaddq_u8(minput, minput);
275 tmp = vpaddq_u8(tmp, tmp);
276 tmp = vpaddq_u8(tmp, tmp);
277 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
278 };
279
280 // fast path for long strings (expected to be common)
281 size_t i = location;
282 uint8x16_t low_mask =
283 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
284 0x00, 0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
285 uint8x16_t high_mask =
286 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
287 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
288 uint8x16_t fmask = vmovq_n_u8(0xf);
289 uint8x16_t zero{0};
290 for (; i + 15 < view.size(); i += 16) {
291 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
292 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
293 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
294 uint8x16_t classify = vandq_u8(lowpart, highpart);
295 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
296 uint8x16_t is_zero = vceqq_u8(classify, zero);
297 uint16_t is_non_zero = ~to_bitmask(is_zero);
298 return i + trailing_zeroes(is_non_zero);
299 }
300 }
301
302 if (i < view.size()) {
303 uint8x16_t word =
304 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
305 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
306 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
307 uint8x16_t classify = vandq_u8(lowpart, highpart);
308 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
309 uint8x16_t is_zero = vceqq_u8(classify, zero);
310 uint16_t is_non_zero = ~to_bitmask(is_zero);
311 return view.length() - 16 + trailing_zeroes(is_non_zero);
312 }
313 }
314 return size_t(view.size());
315}
316#elif ADA_SSE2
318 std::string_view view, size_t location) noexcept {
319 // first check for short strings in which case we do it naively.
320 if (view.size() - location < 16) { // slow path
321 for (size_t i = location; i < view.size(); i++) {
322 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
323 view[i] == '?' || view[i] == '[') {
324 return i;
325 }
326 }
327 return size_t(view.size());
328 }
329 // fast path for long strings (expected to be common)
330 size_t i = location;
331 const __m128i mask1 = _mm_set1_epi8(':');
332 const __m128i mask2 = _mm_set1_epi8('/');
333 const __m128i mask3 = _mm_set1_epi8('\\');
334 const __m128i mask4 = _mm_set1_epi8('?');
335 const __m128i mask5 = _mm_set1_epi8('[');
336
337 for (; i + 15 < view.size(); i += 16) {
338 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
339 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
340 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
341 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
342 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
343 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
344 __m128i m = _mm_or_si128(
345 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
346 int mask = _mm_movemask_epi8(m);
347 if (mask != 0) {
348 return i + trailing_zeroes(mask);
349 }
350 }
351 if (i < view.size()) {
352 __m128i word =
353 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
354 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
355 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
356 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
357 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
358 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
359 __m128i m = _mm_or_si128(
360 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
361 int mask = _mm_movemask_epi8(m);
362 if (mask != 0) {
363 return view.length() - 16 + trailing_zeroes(mask);
364 }
365 }
366 return size_t(view.length());
367}
368#elif ADA_LSX
370 std::string_view view, size_t location) noexcept {
371 // first check for short strings in which case we do it naively.
372 if (view.size() - location < 16) { // slow path
373 for (size_t i = location; i < view.size(); i++) {
374 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
375 view[i] == '?' || view[i] == '[') {
376 return i;
377 }
378 }
379 return size_t(view.size());
380 }
381 // fast path for long strings (expected to be common)
382 size_t i = location;
383 const __m128i mask1 = __lsx_vrepli_b(':');
384 const __m128i mask2 = __lsx_vrepli_b('/');
385 const __m128i mask3 = __lsx_vrepli_b('\\');
386 const __m128i mask4 = __lsx_vrepli_b('?');
387 const __m128i mask5 = __lsx_vrepli_b('[');
388
389 for (; i + 15 < view.size(); i += 16) {
390 __m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0);
391 __m128i m1 = __lsx_vseq_b(word, mask1);
392 __m128i m2 = __lsx_vseq_b(word, mask2);
393 __m128i m3 = __lsx_vseq_b(word, mask3);
394 __m128i m4 = __lsx_vseq_b(word, mask4);
395 __m128i m5 = __lsx_vseq_b(word, mask5);
396 __m128i m =
397 __lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5);
398 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
399 if (mask != 0) {
400 return i + trailing_zeroes(mask);
401 }
402 }
403 if (i < view.size()) {
404 __m128i word =
405 __lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0);
406 __m128i m1 = __lsx_vseq_b(word, mask1);
407 __m128i m2 = __lsx_vseq_b(word, mask2);
408 __m128i m3 = __lsx_vseq_b(word, mask3);
409 __m128i m4 = __lsx_vseq_b(word, mask4);
410 __m128i m5 = __lsx_vseq_b(word, mask5);
411 __m128i m =
412 __lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5);
413 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
414 if (mask != 0) {
415 return view.length() - 16 + trailing_zeroes(mask);
416 }
417 }
418 return size_t(view.length());
419}
420#elif ADA_RVV
422 std::string_view view, size_t location) noexcept {
423 // The LUT approach was a bit slower on the SpacemiT X60, but I could see it
424 // being faster on future hardware.
425#if 0
426 // LUT generated using: s=":/\\?["; list(zip([((ord(c)>>2)&0xF)for c in s],s))
427 static const uint8_t tbl[16] = {
428 0xF, 0, 0, 0, 0, 0, '[', '\\', 0, 0, 0, '/', 0, 0, ':', '?'
429 };
430 vuint8m1_t vtbl = __riscv_vle8_v_u8m1(tbl, 16);
431#endif
432 uint8_t* src = (uint8_t*)view.data() + location;
433 for (size_t vl, n = view.size() - location; n > 0;
434 n -= vl, src += vl, location += vl) {
435 vl = __riscv_vsetvl_e8m1(n);
436 vuint8m1_t v = __riscv_vle8_v_u8m1(src, vl);
437#if 0
438 vuint8m1_t vidx = __riscv_vand(__riscv_vsrl(v, 2, vl), 0xF, vl);
439 vuint8m1_t vlut = __riscv_vrgather(vtbl, vidx, vl);
440 vbool8_t m = __riscv_vmseq(v, vlut, vl);
441#else
442 vbool8_t m1 = __riscv_vmseq(v, ':', vl);
443 vbool8_t m2 = __riscv_vmseq(v, '/', vl);
444 vbool8_t m3 = __riscv_vmseq(v, '?', vl);
445 vbool8_t m4 = __riscv_vmseq(v, '[', vl);
446 vbool8_t m5 = __riscv_vmseq(v, '\\', vl);
447 vbool8_t m = __riscv_vmor(
448 __riscv_vmor(__riscv_vmor(m1, m2, vl), __riscv_vmor(m3, m4, vl), vl),
449 m5, vl);
450#endif
451 long idx = __riscv_vfirst(m, vl);
452 if (idx >= 0) return location + idx;
453 }
454 return size_t(view.size());
455}
456#else
457// : / [ \\ ?
458static constexpr std::array<uint8_t, 256> special_host_delimiters =
459 []() consteval {
460 std::array<uint8_t, 256> result{};
461 for (int i : {':', '/', '[', '\\', '?'}) {
462 result[i] = 1;
463 }
464 return result;
465 }();
466// credit: @the-moisrex recommended a table-based approach
468 std::string_view view, size_t location) noexcept {
469 auto const str = view.substr(location);
470 for (auto pos = str.begin(); pos != str.end(); ++pos) {
471 if (special_host_delimiters[(uint8_t)*pos]) {
472 return pos - str.begin() + location;
473 }
474 }
475 return size_t(view.size());
476}
477#endif
478
479// starting at index location, this finds the next location of a character
480// :, /, ? or [. If none is found, view.size() is returned.
481// For use within get_host_delimiter_location.
482#if ADA_SSSE3
483ada_really_inline size_t find_next_host_delimiter(std::string_view view,
484 size_t location) noexcept {
485 // first check for short strings in which case we do it naively.
486 if (view.size() - location < 16) { // slow path
487 for (size_t i = location; i < view.size(); i++) {
488 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
489 view[i] == '[') {
490 return i;
491 }
492 }
493 return size_t(view.size());
494 }
495 // fast path for long strings (expected to be common)
496 size_t i = location;
497 // Lookup tables for bit classification:
498 // ':' (0x3A): low[0xA]=0x01, high[0x3]=0x01 -> match
499 // '/' (0x2F): low[0xF]=0x02, high[0x2]=0x02 -> match
500 // '?' (0x3F): low[0xF]=0x01, high[0x3]=0x01 -> match
501 // '[' (0x5B): low[0xB]=0x04, high[0x5]=0x04 -> match
502 const __m128i low_mask =
503 _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
504 0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
505 const __m128i high_mask =
506 _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
507 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
508 const __m128i fmask = _mm_set1_epi8(0xf);
509 const __m128i zero = _mm_setzero_si128();
510
511 for (; i + 15 < view.size(); i += 16) {
512 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
513 __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
514 __m128i highpart = _mm_shuffle_epi8(
515 high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
516 __m128i classify = _mm_and_si128(lowpart, highpart);
517 __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
518 // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
519 // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
520 // avoid false positives.
521 int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
522 if (mask != 0) {
523 return i + trailing_zeroes(static_cast<uint32_t>(mask));
524 }
525 }
526
527 if (i < view.size()) {
528 __m128i word =
529 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
530 __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
531 __m128i highpart = _mm_shuffle_epi8(
532 high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
533 __m128i classify = _mm_and_si128(lowpart, highpart);
534 __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
535 // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
536 // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
537 // avoid false positives.
538 int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
539 if (mask != 0) {
540 return view.length() - 16 + trailing_zeroes(static_cast<uint32_t>(mask));
541 }
542 }
543 return size_t(view.size());
544}
545#elif ADA_NEON
546ada_really_inline size_t find_next_host_delimiter(std::string_view view,
547 size_t location) noexcept {
548 // first check for short strings in which case we do it naively.
549 if (view.size() - location < 16) { // slow path
550 for (size_t i = location; i < view.size(); i++) {
551 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
552 view[i] == '[') {
553 return i;
554 }
555 }
556 return size_t(view.size());
557 }
558 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
559 uint8x16_t bit_mask =
560 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
561 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
562 uint8x16_t minput = vandq_u8(input, bit_mask);
563 uint8x16_t tmp = vpaddq_u8(minput, minput);
564 tmp = vpaddq_u8(tmp, tmp);
565 tmp = vpaddq_u8(tmp, tmp);
566 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
567 };
568
569 // fast path for long strings (expected to be common)
570 size_t i = location;
571 uint8x16_t low_mask =
572 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
573 0x00, 0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
574 uint8x16_t high_mask =
575 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
576 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
577 uint8x16_t fmask = vmovq_n_u8(0xf);
578 uint8x16_t zero{0};
579 for (; i + 15 < view.size(); i += 16) {
580 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
581 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
582 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
583 uint8x16_t classify = vandq_u8(lowpart, highpart);
584 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
585 uint8x16_t is_zero = vceqq_u8(classify, zero);
586 uint16_t is_non_zero = ~to_bitmask(is_zero);
587 return i + trailing_zeroes(is_non_zero);
588 }
589 }
590
591 if (i < view.size()) {
592 uint8x16_t word =
593 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
594 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
595 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
596 uint8x16_t classify = vandq_u8(lowpart, highpart);
597 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
598 uint8x16_t is_zero = vceqq_u8(classify, zero);
599 uint16_t is_non_zero = ~to_bitmask(is_zero);
600 return view.length() - 16 + trailing_zeroes(is_non_zero);
601 }
602 }
603 return size_t(view.size());
604}
605#elif ADA_SSE2
606ada_really_inline size_t find_next_host_delimiter(std::string_view view,
607 size_t location) noexcept {
608 // first check for short strings in which case we do it naively.
609 if (view.size() - location < 16) { // slow path
610 for (size_t i = location; i < view.size(); i++) {
611 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
612 view[i] == '[') {
613 return i;
614 }
615 }
616 return size_t(view.size());
617 }
618 // fast path for long strings (expected to be common)
619 size_t i = location;
620 const __m128i mask1 = _mm_set1_epi8(':');
621 const __m128i mask2 = _mm_set1_epi8('/');
622 const __m128i mask4 = _mm_set1_epi8('?');
623 const __m128i mask5 = _mm_set1_epi8('[');
624
625 for (; i + 15 < view.size(); i += 16) {
626 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
627 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
628 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
629 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
630 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
631 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
632 int mask = _mm_movemask_epi8(m);
633 if (mask != 0) {
634 return i + trailing_zeroes(mask);
635 }
636 }
637 if (i < view.size()) {
638 __m128i word =
639 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
640 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
641 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
642 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
643 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
644 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
645 int mask = _mm_movemask_epi8(m);
646 if (mask != 0) {
647 return view.length() - 16 + trailing_zeroes(mask);
648 }
649 }
650 return size_t(view.length());
651}
652#elif ADA_LSX
653ada_really_inline size_t find_next_host_delimiter(std::string_view view,
654 size_t location) noexcept {
655 // first check for short strings in which case we do it naively.
656 if (view.size() - location < 16) { // slow path
657 for (size_t i = location; i < view.size(); i++) {
658 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
659 view[i] == '[') {
660 return i;
661 }
662 }
663 return size_t(view.size());
664 }
665 // fast path for long strings (expected to be common)
666 size_t i = location;
667 const __m128i mask1 = __lsx_vrepli_b(':');
668 const __m128i mask2 = __lsx_vrepli_b('/');
669 const __m128i mask4 = __lsx_vrepli_b('?');
670 const __m128i mask5 = __lsx_vrepli_b('[');
671
672 for (; i + 15 < view.size(); i += 16) {
673 __m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0);
674 __m128i m1 = __lsx_vseq_b(word, mask1);
675 __m128i m2 = __lsx_vseq_b(word, mask2);
676 __m128i m4 = __lsx_vseq_b(word, mask4);
677 __m128i m5 = __lsx_vseq_b(word, mask5);
678 __m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5));
679 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
680 if (mask != 0) {
681 return i + trailing_zeroes(mask);
682 }
683 }
684 if (i < view.size()) {
685 __m128i word =
686 __lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0);
687 __m128i m1 = __lsx_vseq_b(word, mask1);
688 __m128i m2 = __lsx_vseq_b(word, mask2);
689 __m128i m4 = __lsx_vseq_b(word, mask4);
690 __m128i m5 = __lsx_vseq_b(word, mask5);
691 __m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5));
692 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
693 if (mask != 0) {
694 return view.length() - 16 + trailing_zeroes(mask);
695 }
696 }
697 return size_t(view.length());
698}
699#elif ADA_RVV
700ada_really_inline size_t find_next_host_delimiter(std::string_view view,
701 size_t location) noexcept {
702 uint8_t* src = (uint8_t*)view.data() + location;
703 for (size_t vl, n = view.size() - location; n > 0;
704 n -= vl, src += vl, location += vl) {
705 vl = __riscv_vsetvl_e8m1(n);
706 vuint8m1_t v = __riscv_vle8_v_u8m1(src, vl);
707 vbool8_t m1 = __riscv_vmseq(v, ':', vl);
708 vbool8_t m2 = __riscv_vmseq(v, '/', vl);
709 vbool8_t m3 = __riscv_vmseq(v, '?', vl);
710 vbool8_t m4 = __riscv_vmseq(v, '[', vl);
711 vbool8_t m =
712 __riscv_vmor(__riscv_vmor(m1, m2, vl), __riscv_vmor(m3, m4, vl), vl);
713 long idx = __riscv_vfirst(m, vl);
714 if (idx >= 0) return location + idx;
715 }
716 return size_t(view.size());
717}
718#else
719// : / [ ?
720static constexpr std::array<uint8_t, 256> host_delimiters = []() consteval {
721 std::array<uint8_t, 256> result{};
722 for (int i : {':', '/', '?', '['}) {
723 result[i] = 1;
724 }
725 return result;
726}();
727// credit: @the-moisrex recommended a table-based approach
728ada_really_inline size_t find_next_host_delimiter(std::string_view view,
729 size_t location) noexcept {
730 auto const str = view.substr(location);
731 for (auto pos = str.begin(); pos != str.end(); ++pos) {
732 if (host_delimiters[(uint8_t)*pos]) {
733 return pos - str.begin() + location;
734 }
735 }
736 return size_t(view.size());
737}
738#endif
739
740ada_really_inline std::pair<size_t, bool> get_host_delimiter_location(
741 const bool is_special, std::string_view& view) noexcept {
750 const size_t view_size = view.size();
751 size_t location = 0;
752 bool found_colon = false;
772 if (is_special) {
773 // We move to the next delimiter.
774 location = find_next_host_delimiter_special(view, location);
775 // Unless we find '[' then we are going only going to have to call
776 // find_next_host_delimiter_special once.
777 for (; location < view_size;
778 location = find_next_host_delimiter_special(view, location)) {
779 if (view[location] == '[') {
780 location = view.find(']', location);
781 if (location == std::string_view::npos) {
782 // performance: view.find might get translated to a memchr, which
783 // has no notion of std::string_view::npos, so the code does not
784 // reflect the assembly.
785 location = view_size;
786 break;
787 }
788 } else {
789 found_colon = view[location] == ':';
790 break;
791 }
792 }
793 } else {
794 // We move to the next delimiter.
795 location = find_next_host_delimiter(view, location);
796 // Unless we find '[' then we are going only going to have to call
797 // find_next_host_delimiter_special once.
798 for (; location < view_size;
799 location = find_next_host_delimiter(view, location)) {
800 if (view[location] == '[') {
801 location = view.find(']', location);
802 if (location == std::string_view::npos) {
803 // performance: view.find might get translated to a memchr, which
804 // has no notion of std::string_view::npos, so the code does not
805 // reflect the assembly.
806 location = view_size;
807 break;
808 }
809 } else {
810 found_colon = view[location] == ':';
811 break;
812 }
813 }
814 }
815 // performance: remove_suffix may translate into a single instruction.
816 view.remove_suffix(view_size - location);
817 return {location, found_colon};
818}
819
820void trim_c0_whitespace(std::string_view& input) noexcept {
821 while (!input.empty() &&
822 ada::unicode::is_c0_control_or_space(input.front())) {
823 input.remove_prefix(1);
824 }
825 while (!input.empty() && ada::unicode::is_c0_control_or_space(input.back())) {
826 input.remove_suffix(1);
827 }
828}
829
830ada_really_inline void parse_prepared_path(std::string_view input,
832 std::string& path) {
833 ada_log("parse_prepared_path ", input);
834 uint8_t accumulator = checkers::path_signature(input);
835 // Let us first detect a trivial case.
836 // If it is special, we check that we have no dot, no %, no \ and no
837 // character needing percent encoding. Otherwise, we check that we have no %,
838 // no dot, and no character needing percent encoding.
839 constexpr uint8_t need_encoding = 1;
840 constexpr uint8_t backslash_char = 2;
841 constexpr uint8_t dot_char = 4;
842 constexpr uint8_t percent_char = 8;
843 bool special = type != ada::scheme::NOT_SPECIAL;
844 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
846 bool trivial_path =
847 (special ? (accumulator == 0)
848 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
849 0)) &&
850 (!may_need_slow_file_handling);
851 if (accumulator == dot_char && !may_need_slow_file_handling) {
852 // '4' means that we have at least one dot, but nothing that requires
853 // percent encoding or decoding. The only part that is not trivial is
854 // that we may have single dots and double dots path segments.
855 // If we have such segments, then we either have a path that begins
856 // with '.' (easy to check), or we have the sequence './'.
857 // Note: input cannot be empty, it must at least contain one character ('.')
858 // Note: we know that '\' is not present.
859 if (input[0] != '.') {
860 size_t slashdot = 0;
861 bool dot_is_file = true;
862 for (;;) {
863 slashdot = input.find("/.", slashdot);
864 if (slashdot == std::string_view::npos) { // common case
865 break;
866 } else { // uncommon
867 // only three cases matter: /./, /.. or a final /
868 slashdot += 2;
869 dot_is_file &= !(slashdot == input.size() || input[slashdot] == '.' ||
870 input[slashdot] == '/');
871 }
872 }
873 trivial_path = dot_is_file;
874 }
875 }
876 if (trivial_path) {
877 ada_log("parse_path trivial");
878 path += '/';
879 path += input;
880 return;
881 }
882 // We are going to need to look a bit at the path, but let us see if we can
883 // ignore percent encoding *and* backslashes *and* percent characters.
884 // Except for the trivial case, this is likely to capture 99% of paths out
885 // there.
886 bool fast_path =
887 (special &&
888 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
889 (type != ada::scheme::type::FILE);
890 if (fast_path) {
891 ada_log("parse_prepared_path fast");
892 // Here we don't need to worry about \ or percent encoding.
893 // We also do not have a file protocol. We might have dots, however,
894 // but dots must as appear as '.', and they cannot be encoded because
895 // the symbol '%' is not present.
896 size_t previous_location = 0; // We start at 0.
897 do {
898 size_t new_location = input.find('/', previous_location);
899 // std::string_view path_view = input;
900 // We process the last segment separately:
901 if (new_location == std::string_view::npos) {
902 std::string_view path_view = input.substr(previous_location);
903 if (path_view == "..") { // The path ends with ..
904 // e.g., if you receive ".." with an empty path, you go to "/".
905 if (path.empty()) {
906 path = '/';
907 return;
908 }
909 // Fast case where we have nothing to do:
910 if (path.back() == '/') {
911 return;
912 }
913 // If you have the path "/joe/myfriend",
914 // then you delete 'myfriend'.
915 path.resize(path.rfind('/') + 1);
916 return;
917 }
918 path += '/';
919 if (path_view != ".") {
920 path.append(path_view);
921 }
922 return;
923 } else {
924 // This is a non-final segment.
925 std::string_view path_view =
926 input.substr(previous_location, new_location - previous_location);
927 previous_location = new_location + 1;
928 if (path_view == "..") {
929 size_t last_delimiter = path.rfind('/');
930 if (last_delimiter != std::string::npos) {
931 path.erase(last_delimiter);
932 }
933 } else if (path_view != ".") {
934 path += '/';
935 path.append(path_view);
936 }
937 }
938 } while (true);
939 } else {
940 ada_log("parse_path slow");
941 // we have reached the general case
942 bool needs_percent_encoding = (accumulator & 1);
943 std::string path_buffer_tmp;
944 do {
945 size_t location = (special && (accumulator & 2))
946 ? input.find_first_of("/\\")
947 : input.find('/');
948 std::string_view path_view = input;
949 if (location != std::string_view::npos) {
950 path_view.remove_suffix(path_view.size() - location);
951 input.remove_prefix(location + 1);
952 }
953 // path_buffer is either path_view or it might point at a percent encoded
954 // temporary file.
955 std::string_view path_buffer =
956 (needs_percent_encoding &&
957 ada::unicode::percent_encode<false>(
958 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
959 ? path_buffer_tmp
960 : path_view;
961 if (unicode::is_double_dot_path_segment(path_buffer)) {
962 helpers::shorten_path(path, type);
963 if (location == std::string_view::npos) {
964 path += '/';
965 }
966 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
967 (location == std::string_view::npos)) {
968 path += '/';
969 }
970 // Otherwise, if path_buffer is not a single-dot path segment, then:
971 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
972 // If url's scheme is "file", url's path is empty, and path_buffer is a
973 // Windows drive letter, then replace the second code point in
974 // path_buffer with U+003A (:).
975 if (type == ada::scheme::type::FILE && path.empty() &&
977 path += '/';
978 path += path_buffer[0];
979 path += ':';
980 path_buffer.remove_prefix(2);
981 path.append(path_buffer);
982 } else {
983 // Append path_buffer to url's path.
984 path += '/';
985 path.append(path_buffer);
986 }
987 }
988 if (location == std::string_view::npos) {
989 return;
990 }
991 } while (true);
992 }
993}
994
995bool overlaps(std::string_view input1, const std::string& input2) noexcept {
996 ada_log("helpers::overlaps check if string_view '", input1, "' [",
997 input1.size(), " bytes] is part of string '", input2, "' [",
998 input2.size(), " bytes]");
999 return !input1.empty() && !input2.empty() && input1.data() >= input2.data() &&
1000 input1.data() < input2.data() + input2.size();
1001}
1002
1003template <class url_type>
1004ada_really_inline void strip_trailing_spaces_from_opaque_path(url_type& url) {
1005 ada_log("helpers::strip_trailing_spaces_from_opaque_path");
1006 if (!url.has_opaque_path) return;
1007 if (url.has_hash()) return;
1008 if (url.has_search()) return;
1009
1010 auto path = std::string(url.get_pathname());
1011 while (!path.empty() && path.back() == ' ') {
1012 path.resize(path.size() - 1);
1013 }
1014 url.update_base_pathname(path);
1015}
1016
1017// @ / \\ ?
1018static constexpr std::array<uint8_t, 256> authority_delimiter_special =
1019 []() consteval {
1020 std::array<uint8_t, 256> result{};
1021 for (uint8_t i : {'@', '/', '\\', '?'}) {
1022 result[i] = 1;
1023 }
1024 return result;
1025 }();
1026// credit: @the-moisrex recommended a table-based approach
1027ada_really_inline size_t
1028find_authority_delimiter_special(std::string_view view) noexcept {
1029 // performance note: we might be able to gain further performance
1030 // with SIMD intrinsics.
1031 for (auto pos = view.begin(); pos != view.end(); ++pos) {
1032 if (authority_delimiter_special[(uint8_t)*pos]) {
1033 return pos - view.begin();
1034 }
1035 }
1036 return size_t(view.size());
1037}
1038
1039// @ / ?
1040static constexpr std::array<uint8_t, 256> authority_delimiter = []() consteval {
1041 std::array<uint8_t, 256> result{};
1042 for (uint8_t i : {'@', '/', '?'}) {
1043 result[i] = 1;
1044 }
1045 return result;
1046}();
1047// credit: @the-moisrex recommended a table-based approach
1048ada_really_inline size_t
1049find_authority_delimiter(std::string_view view) noexcept {
1050 // performance note: we might be able to gain further performance
1051 // with SIMD instrinsics.
1052 for (auto pos = view.begin(); pos != view.end(); ++pos) {
1053 if (authority_delimiter[(uint8_t)*pos]) {
1054 return pos - view.begin();
1055 }
1056 }
1057 return size_t(view.size());
1058}
1059
1060} // namespace ada::helpers
1061
1062namespace ada {
1066#undef ada_make_uint8x16_t
1067} // namespace ada
Definitions for URL specific checkers used within Ada.
Cross-platform compiler macros and common definitions.
#define ADA_ASSERT_TRUE(COND)
#define ada_unused
Definition common_defs.h:88
#define ada_warn_unused
Definition common_defs.h:89
#define ada_really_inline
Definition common_defs.h:85
Definitions for helper functions used within Ada.
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
Includes the definitions for helper functions.
ada_really_inline size_t find_next_host_delimiter(std::string_view view, size_t location) noexcept
Definition helpers.cpp:728
static constexpr std::array< uint8_t, 256 > authority_delimiter_special
Definition helpers.cpp:1018
static constexpr std::array< uint8_t, 256 > host_delimiters
Definition helpers.cpp:720
ada_really_inline size_t find_next_host_delimiter_special(std::string_view view, size_t location) noexcept
Definition helpers.cpp:467
ada_unused std::string get_state(ada::state s)
Definition helpers.cpp:42
static constexpr std::array< uint8_t, 256 > authority_delimiter
Definition helpers.cpp:1040
static constexpr std::array< uint8_t, 256 > special_host_delimiters
Definition helpers.cpp:458
ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept
Definition helpers.cpp:171
type
Enumeration of URL scheme types.
Definition scheme.h:41
@ NOT_SPECIAL
Definition scheme.h:43
Definition ada_idna.h:13
state
States in the URL parsing state machine.
Definition state.h:27
@ SPECIAL_RELATIVE_OR_AUTHORITY
Definition state.h:101
@ FILE_SLASH
Definition state.h:81
@ SCHEME
Definition state.h:41
@ SPECIAL_AUTHORITY_SLASHES
Definition state.h:96
@ FRAGMENT
Definition state.h:56
@ FILE_HOST
Definition state.h:76
@ OPAQUE_PATH
Definition state.h:121
@ RELATIVE_SLASH
Definition state.h:66
@ NO_SCHEME
Definition state.h:51
@ PATH_START
Definition state.h:116
@ RELATIVE_SCHEME
Definition state.h:61
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
Definition state.h:91
@ SCHEME_START
Definition state.h:36
@ AUTHORITY
Definition state.h:31
@ PATH_OR_AUTHORITY
Definition state.h:86
ada_warn_unused std::string_view to_string(encoding_type type)
tl::expected< result_type, ada::errors > result
URL scheme type definitions and utilities.
URL parser state machine states.