Ada 3.4.4
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
parser.cpp
Go to the documentation of this file.
1#include "ada/parser-inl.h"
2
3#include <limits>
4#include <ranges>
5
7#include "ada/common_defs.h"
8#include "ada/log.h"
9#include "ada/unicode.h"
10
11namespace ada::parser {
12
// Parses `user_input` into a `result_type` (either ada::url or
// ada::url_aggregator, enforced by the static_assert below), optionally
// resolving it against `base_url`.
//
// - user_input: the URL string. ASCII tabs/newlines are removed and leading /
//   trailing C0 control or space characters are trimmed before parsing.
// - base_url: may be nullptr. If it is non-null and not valid, the returned
//   url has is_valid == false.
// - store_values: compile-time switch; several component updates (and the
//   up-front reserve for url_aggregator) only happen when it is true.
//
// Returns the parsed url by value. Failure is reported through
// `url.is_valid == false`, never by throwing from this function itself.
//
// NOTE(review): this listing skips some numbered lines. In particular the
// declaration of `state`, most `state = ...` transitions and a few `case`
// labels are not visible here; the spec-style comments next to each gap
// describe the intended transition.
13template <class result_type, bool store_values>
14result_type parse_url_impl(std::string_view user_input,
15 const result_type* base_url) {
16 // We can specialize the implementation per type.
17 // Important: result_type_is_ada_url is evaluated at *compile time*. This
18 // means that doing if constexpr(result_type_is_ada_url) { something } else {
19 // something else } is free (at runtime). This means that ada::url_aggregator
20 // and ada::url **do not have to support the exact same API**.
21 constexpr bool result_type_is_ada_url = std::is_same_v<url, result_type>;
22 constexpr bool result_type_is_ada_url_aggregator =
23 std::is_same_v<url_aggregator, result_type>;
24 static_assert(result_type_is_ada_url ||
25 result_type_is_ada_url_aggregator); // We don't support
26 // anything else for now.
27
28 ada_log("ada::parser::parse_url('", user_input, "' [", user_input.size(),
29 " bytes],", (base_url != nullptr ? base_url->to_string() : "null"),
30 ")");
31
33 result_type url{};
34
35 // We refuse to parse URL strings that exceed 4GB. Such strings are almost
36 // surely the result of a bug or are otherwise a security concern.
37 if (user_input.size() > std::numeric_limits<uint32_t>::max()) [[unlikely]] {
38 url.is_valid = false;
39 }
40 // Going forward, user_input.size() is in [0,
41 // std::numeric_limits<uint32_t>::max] (inclusive: the check above only
42 // rejects sizes strictly greater than max). If we are provided with an
 // invalid base, or the input was rejected above, we must return.
43 if (base_url != nullptr) {
44 url.is_valid &= base_url->is_valid;
45 }
46 if (!url.is_valid) {
47 return url;
48 }
49 if constexpr (result_type_is_ada_url_aggregator && store_values) {
50 // Most of the time, we just need user_input.size().
51 // In some instances, we may need a bit more.
53 // This is *very* important. This line should *not* be removed
54 // hastily. There are principled reasons why reserve is important
55 // for performance. If you have a benchmark with small inputs,
56 // it may not matter, but in other instances, it could.
58 // This rounds up to the next power of two.
59 // We know that user_input.size() is in [0,
60 // std::numeric_limits<uint32_t>::max].
 // NOTE(review): presumably helpers::leading_zeroes returns the number of
 // leading zero bits. If so, for sizes >= 2^31 the shift is by 0 and the
 // "+ 1" wraps the 32-bit value to 0, i.e. nothing is reserved - confirm
 // whether that is intended.
61 uint32_t reserve_capacity =
62 (0xFFFFFFFF >>
63 helpers::leading_zeroes(uint32_t(1 | user_input.size()))) +
64 1;
65 url.reserve(reserve_capacity);
66 }
67 std::string tmp_buffer;
68 std::string_view url_data;
69 if (unicode::has_tabs_or_newline(user_input)) [[unlikely]] {
70 tmp_buffer = user_input;
71 // Optimization opportunity: Instead of copying and then pruning, we could
72 // just directly build the string from user_input.
73 helpers::remove_ascii_tab_or_newline(tmp_buffer);
74 url_data = tmp_buffer;
75 } else [[likely]] {
76 url_data = user_input;
77 }
78
79 // Leading and trailing control characters are uncommon and easy to deal with
80 // (no performance concern).
81 helpers::trim_c0_whitespace(url_data);
82
83 // Optimization opportunity. Most websites do not have fragment.
84 std::optional<std::string_view> fragment = helpers::prune_hash(url_data);
85 // We add it last so that an implementation like ada::url_aggregator
86 // can append it last to its internal buffer, thus improving performance.
87
88 // Here url_data no longer has its fragment.
89 // We are going to access the data from url_data (it is immutable).
90 // At any given time, we are pointing at byte 'input_position' in url_data.
91 // The input_position variable should range from 0 to input_size.
92 // It is illegal to access url_data at input_size.
93 size_t input_position = 0;
94 const size_t input_size = url_data.size();
95 // Keep running the following state machine by switching on state.
96 // If after a run pointer points to the EOF code point, go to the next step.
97 // Otherwise, increase pointer by 1 and continue with the state machine.
98 // We never decrement input_position, except for the "start over" reset to 0
 // in the SCHEME state when no scheme is found.
99 while (input_position <= input_size) {
100 ada_log("In parsing at ", input_position, " out of ", input_size,
101 " in state ", ada::to_string(state));
102 switch (state) {
103 case state::SCHEME_START: {
104 ada_log("SCHEME_START ", helpers::substring(url_data, input_position));
105 // If c is an ASCII alpha, append c, lowercased, to buffer, and set
106 // state to scheme state.
107 if ((input_position != input_size) &&
108 checkers::is_alpha(url_data[input_position])) {
110 input_position++;
111 } else {
112 // Otherwise, if state override is not given, set state to no scheme
113 // state and decrease pointer by 1.
115 }
116 break;
117 }
118 case state::SCHEME: {
119 ada_log("SCHEME ", helpers::substring(url_data, input_position));
120 // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.),
121 // append c, lowercased, to buffer.
122 while ((input_position != input_size) &&
123 (unicode::is_alnum_plus(url_data[input_position]))) {
124 input_position++;
125 }
126 // Otherwise, if c is U+003A (:), then:
127 if ((input_position != input_size) &&
128 (url_data[input_position] == ':')) {
129 ada_log("SCHEME the scheme should be ",
130 url_data.substr(0, input_position));
131 if constexpr (result_type_is_ada_url) {
132 if (!url.parse_scheme(url_data.substr(0, input_position))) {
133 return url;
134 }
135 } else {
136 // we pass the colon along instead of painfully adding it back.
137 if (!url.parse_scheme_with_colon(
138 url_data.substr(0, input_position + 1))) {
139 return url;
140 }
141 }
142 ada_log("SCHEME the scheme is ", url.get_protocol());
143
144 // If url's scheme is "file", then:
145 if (url.type == scheme::type::FILE) {
146 // Set state to file state.
148 }
149 // Otherwise, if url is special, base is non-null, and base's scheme
150 // is url's scheme: Note: Doing base_url->scheme is unsafe if base_url
151 // != nullptr is false.
152 else if (url.is_special() && base_url != nullptr &&
153 base_url->type == url.type) {
154 // Set state to special relative or authority state.
156 }
157 // Otherwise, if url is special, set state to special authority
158 // slashes state.
159 else if (url.is_special()) {
161 }
162 // Otherwise, if remaining starts with an U+002F (/), set state to
163 // path or authority state and increase pointer by 1.
164 else if (input_position + 1 < input_size &&
165 url_data[input_position + 1] == '/') {
167 input_position++;
168 }
169 // Otherwise, set url's path to the empty string and set state to
170 // opaque path state.
171 else {
173 }
174 }
175 // Otherwise, if state override is not given, set buffer to the empty
176 // string, state to no scheme state, and start over (from the first code
177 // point in input).
178 else {
180 input_position = 0;
181 break;
182 }
183 input_position++;
184 break;
185 }
186 case state::NO_SCHEME: {
187 ada_log("NO_SCHEME ", helpers::substring(url_data, input_position));
188 // If base is null, or base has an opaque path and c is not U+0023 (#),
189 // validation error, return failure.
190 if (base_url == nullptr ||
191 (base_url->has_opaque_path && !fragment.has_value())) {
192 ada_log("NO_SCHEME validation error");
193 url.is_valid = false;
194 return url;
195 }
196 // Otherwise, if base has an opaque path and c is U+0023 (#),
197 // set url's scheme to base's scheme, url's path to base's path, url's
198 // query to base's query, and set state to fragment state.
199 else if (base_url->has_opaque_path && fragment.has_value() &&
200 input_position == input_size) {
201 ada_log("NO_SCHEME opaque base with fragment");
202 url.copy_scheme(*base_url);
203 url.has_opaque_path = base_url->has_opaque_path;
204
205 if constexpr (result_type_is_ada_url) {
206 url.path = base_url->path;
207 url.query = base_url->query;
208 } else {
209 url.update_base_pathname(base_url->get_pathname());
210 if (base_url->has_search()) {
211 // get_search() returns "" for an empty query string (URL ends
212 // with '?'). update_base_search("") would incorrectly clear the
213 // query, so pass "?" to preserve the empty query distinction.
214 auto s = base_url->get_search();
215 url.update_base_search(s.empty() ? std::string_view("?") : s);
216 }
217 }
218 url.update_unencoded_base_hash(*fragment);
219 return url;
220 }
221 // Otherwise, if base's scheme is not "file", set state to relative
222 // state and decrease pointer by 1.
223 else if (base_url->type != scheme::type::FILE) {
224 ada_log("NO_SCHEME non-file relative path");
226 }
227 // Otherwise, set state to file state and decrease pointer by 1.
228 else {
229 ada_log("NO_SCHEME file base type");
231 }
232 break;
233 }
234 case state::AUTHORITY: {
235 ada_log("AUTHORITY ", helpers::substring(url_data, input_position));
236 // most URLs have no @. Having no @ tells us that we don't have to worry
237 // about AUTHORITY. Of course, we could have @ and still not have to
238 // worry about AUTHORITY.
239 // TODO: Instead of just collecting a bool, collect the location of the
240 // '@' and do something useful with it.
241 // TODO: We could do various processing early on, using a single pass
242 // over the string to collect information about it, e.g., telling us
243 // whether there is a @ and if so, where (or how many).
244
245 // Check if url data contains an @.
246 if (url_data.find('@', input_position) == std::string_view::npos) {
248 break;
249 }
250 bool at_sign_seen{false};
251 bool password_token_seen{false};
257 do {
258 std::string_view view = url_data.substr(input_position);
259 // The delimiters are @, /, ? and \ (the backslash presumably only for
 // special urls, matching the check further below - confirm in helpers).
260 size_t location =
261 url.is_special() ? helpers::find_authority_delimiter_special(view)
262 : helpers::find_authority_delimiter(view);
263 std::string_view authority_view = view.substr(0, location);
264 size_t end_of_authority = input_position + authority_view.size();
265 // If c is U+0040 (@), then:
266 if ((end_of_authority != input_size) &&
267 (url_data[end_of_authority] == '@')) {
268 // If atSignSeen is true, then prepend "%40" to buffer.
269 if (at_sign_seen) {
270 if (password_token_seen) {
271 if constexpr (result_type_is_ada_url) {
272 url.password += "%40";
273 } else {
274 url.append_base_password("%40");
275 }
276 } else {
277 if constexpr (result_type_is_ada_url) {
278 url.username += "%40";
279 } else {
280 url.append_base_username("%40");
281 }
282 }
283 }
284
285 at_sign_seen = true;
286
287 if (!password_token_seen) {
288 size_t password_token_location = authority_view.find(':');
289 password_token_seen =
290 password_token_location != std::string_view::npos;
291
292 if constexpr (store_values) {
293 if (!password_token_seen) {
294 if constexpr (result_type_is_ada_url) {
295 url.username += unicode::percent_encode(
296 authority_view,
298 } else {
299 url.append_base_username(unicode::percent_encode(
300 authority_view,
302 }
303 } else {
304 if constexpr (result_type_is_ada_url) {
305 url.username += unicode::percent_encode(
306 authority_view.substr(0, password_token_location),
308 url.password += unicode::percent_encode(
309 authority_view.substr(password_token_location + 1),
311 } else {
312 url.append_base_username(unicode::percent_encode(
313 authority_view.substr(0, password_token_location),
315 url.append_base_password(unicode::percent_encode(
316 authority_view.substr(password_token_location + 1),
318 }
319 }
320 }
321 } else if constexpr (store_values) {
322 if constexpr (result_type_is_ada_url) {
323 url.password += unicode::percent_encode(
325 } else {
326 url.append_base_password(unicode::percent_encode(
328 }
329 }
330 }
331 // Otherwise, if one of the following is true:
332 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
333 // - url is special and c is U+005C (\)
 // ('#' is not tested here: the fragment was already pruned from
 // url_data before the state machine started.)
334 else if (end_of_authority == input_size ||
335 url_data[end_of_authority] == '/' ||
336 url_data[end_of_authority] == '?' ||
337 (url.is_special() && url_data[end_of_authority] == '\\')) {
338 // If atSignSeen is true and authority_view is the empty string,
339 // validation error, return failure.
340 if (at_sign_seen && authority_view.empty()) {
341 url.is_valid = false;
342 return url;
343 }
345 break;
346 }
347 if (end_of_authority == input_size) {
348 if constexpr (store_values) {
349 if (fragment.has_value()) {
350 url.update_unencoded_base_hash(*fragment);
351 }
352 }
353 return url;
354 }
355 input_position = end_of_authority + 1;
356 } while (true);
357
358 break;
359 }
361 ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ",
362 helpers::substring(url_data, input_position));
363
364 // If c is U+002F (/) and remaining starts with U+002F (/),
365 // then set state to special authority ignore slashes state and increase
366 // pointer by 1.
367 if (url_data.substr(input_position, 2) == "//") {
369 input_position += 2;
370 } else {
371 // Otherwise, validation error, set state to relative state and
372 // decrease pointer by 1.
374 }
375
376 break;
377 }
379 ada_log("PATH_OR_AUTHORITY ",
380 helpers::substring(url_data, input_position));
381
382 // If c is U+002F (/), then set state to authority state.
383 if ((input_position != input_size) &&
384 (url_data[input_position] == '/')) {
386 input_position++;
387 } else {
388 // Otherwise, set state to path state, and decrease pointer by 1.
390 }
391
392 break;
393 }
395 ada_log("RELATIVE_SCHEME ",
396 helpers::substring(url_data, input_position));
397
398 // Set url's scheme to base's scheme.
399 url.copy_scheme(*base_url);
400
401 // If c is U+002F (/), then set state to relative slash state.
402 if ((input_position != input_size) &&
403 // NOLINTNEXTLINE(bugprone-branch-clone)
404 (url_data[input_position] == '/')) {
405 ada_log(
406 "RELATIVE_SCHEME if c is U+002F (/), then set state to relative "
407 "slash state");
409 } else if (url.is_special() && (input_position != input_size) &&
410 (url_data[input_position] == '\\')) {
411 // Otherwise, if url is special and c is U+005C (\), validation error,
412 // set state to relative slash state.
413 ada_log(
414 "RELATIVE_SCHEME if url is special and c is U+005C, validation "
415 "error, set state to relative slash state");
417 } else {
418 ada_log("RELATIVE_SCHEME otherwise");
419 // Set url's username to base's username, url's password to base's
420 // password, url's host to base's host, url's port to base's port,
421 // url's path to a clone of base's path, and url's query to base's
422 // query.
423 if constexpr (result_type_is_ada_url) {
424 url.username = base_url->username;
425 url.password = base_url->password;
426 url.host = base_url->host;
427 url.port = base_url->port;
428 // cloning the base path includes cloning the has_opaque_path flag
429 url.has_opaque_path = base_url->has_opaque_path;
430 url.path = base_url->path;
431 url.query = base_url->query;
432 } else {
433 url.update_base_authority(base_url->get_href(),
434 base_url->get_components());
435 url.update_host_to_base_host(base_url->get_hostname());
436 url.update_base_port(base_url->retrieve_base_port());
437 // cloning the base path includes cloning the has_opaque_path flag
438 url.has_opaque_path = base_url->has_opaque_path;
439 url.update_base_pathname(base_url->get_pathname());
440 if (base_url->has_search()) {
441 // get_search() returns "" for an empty query string (URL ends
442 // with '?'). update_base_search("") would incorrectly clear the
443 // query, so pass "?" to preserve the empty query distinction.
444 auto s = base_url->get_search();
445 url.update_base_search(s.empty() ? std::string_view("?") : s);
446 }
447 }
448
 // NOTE(review): both branches above already copy has_opaque_path from
 // the base, so this assignment looks redundant.
449 url.has_opaque_path = base_url->has_opaque_path;
450
451 // If c is U+003F (?), then set url's query to the empty string, and
452 // state to query state.
453 if ((input_position != input_size) &&
454 (url_data[input_position] == '?')) {
456 }
457 // Otherwise, if c is not the EOF code point:
458 else if (input_position != input_size) {
459 // Set url's query to null.
460 url.clear_search();
461 if constexpr (result_type_is_ada_url) {
462 // Shorten url's path.
463 helpers::shorten_path(url.path, url.type);
464 } else {
465 std::string_view path = url.get_pathname();
466 if (helpers::shorten_path(path, url.type)) {
 // NOTE(review): std::move on a temporary std::string is redundant.
467 url.update_base_pathname(std::move(std::string(path)));
468 }
469 }
470 // Set state to path state and decrease pointer by 1.
472 break;
473 }
474 }
475 input_position++;
476 break;
477 }
479 ada_log("RELATIVE_SLASH ",
480 helpers::substring(url_data, input_position));
481
482 // If url is special and c is U+002F (/) or U+005C (\), then:
483 if (url.is_special() && (input_position != input_size) &&
484 (url_data[input_position] == '/' ||
485 url_data[input_position] == '\\')) {
486 // Set state to special authority ignore slashes state.
488 }
489 // Otherwise, if c is U+002F (/), then set state to authority state.
490 else if ((input_position != input_size) &&
491 (url_data[input_position] == '/')) {
493 }
494 // Otherwise, set
495 // - url's username to base's username,
496 // - url's password to base's password,
497 // - url's host to base's host,
498 // - url's port to base's port,
499 // - state to path state, and then, decrease pointer by 1.
500 else {
501 if constexpr (result_type_is_ada_url) {
502 url.username = base_url->username;
503 url.password = base_url->password;
504 url.host = base_url->host;
505 url.port = base_url->port;
506 } else {
507 url.update_base_authority(base_url->get_href(),
508 base_url->get_components());
509 url.update_host_to_base_host(base_url->get_hostname());
510 url.update_base_port(base_url->retrieve_base_port());
511 }
513 break;
514 }
515
516 input_position++;
517 break;
518 }
520 ada_log("SPECIAL_AUTHORITY_SLASHES ",
521 helpers::substring(url_data, input_position));
522
523 // If c is U+002F (/) and remaining starts with U+002F (/),
524 // then set state to special authority ignore slashes state and increase
525 // pointer by 1.
526 if (url_data.substr(input_position, 2) == "//") {
527 input_position += 2;
528 }
529
530 [[fallthrough]];
531 }
533 ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ",
534 helpers::substring(url_data, input_position));
535
536 // If c is neither U+002F (/) nor U+005C (\), then set state to
537 // authority state and decrease pointer by 1.
538 while ((input_position != input_size) &&
539 ((url_data[input_position] == '/') ||
540 (url_data[input_position] == '\\'))) {
541 input_position++;
542 }
544
545 break;
546 }
547 case state::QUERY: {
548 ada_log("QUERY ", helpers::substring(url_data, input_position));
549 if constexpr (store_values) {
550 // Let queryPercentEncodeSet be the special-query percent-encode set
551 // if url is special; otherwise the query percent-encode set.
552 const uint8_t* query_percent_encode_set =
555
556 // Percent-encode after encoding, with encoding, buffer, and
557 // queryPercentEncodeSet, and append the result to url's query.
558 url.update_base_search(url_data.substr(input_position),
559 query_percent_encode_set);
560 ada_log("QUERY update_base_search completed ");
561 if (fragment.has_value()) {
562 url.update_unencoded_base_hash(*fragment);
563 }
564 }
565 return url;
566 }
567 case state::HOST: {
568 ada_log("HOST ", helpers::substring(url_data, input_position));
569
570 std::string_view host_view = url_data.substr(input_position);
571 auto [location, found_colon] =
572 helpers::get_host_delimiter_location(url.is_special(), host_view);
573 input_position = (location != std::string_view::npos)
574 ? input_position + location
575 : input_size;
576 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
577 // Note: the 'found_colon' value is true if and only if a colon was
578 // encountered while not inside brackets.
579 if (found_colon) {
580 // If buffer is the empty string, validation error, return failure.
581 // Let host be the result of host parsing buffer with url is not
582 // special.
583 ada_log("HOST parsing ", host_view);
584 if (!url.parse_host(host_view)) {
585 return url;
586 }
587 ada_log("HOST parsing results in ", url.get_hostname());
588 // Set url's host to host, buffer to the empty string, and state to
589 // port state.
591 input_position++;
592 }
593 // Otherwise, if one of the following is true:
594 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
595 // - url is special and c is U+005C (\)
596 // The get_host_delimiter_location function either brings us to
597 // the colon outside of the bracket, or to one of those characters.
598 else {
599 // If url is special and host_view is the empty string, validation
600 // error, return failure.
601 if (host_view.empty() && url.is_special()) {
602 url.is_valid = false;
603 return url;
604 }
605 ada_log("HOST parsing ", host_view, " href=", url.get_href());
606 // Let host be the result of host parsing host_view with url is not
607 // special.
608 if (host_view.empty()) {
609 url.update_base_hostname("");
610 } else if (!url.parse_host(host_view)) {
611 return url;
612 }
613 ada_log("HOST parsing results in ", url.get_hostname(),
614 " href=", url.get_href());
615
616 // Set url's host to host, and state to path start state.
618 }
619
620 break;
621 }
622 case state::OPAQUE_PATH: {
623 ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position));
624 std::string_view view = url_data.substr(input_position);
625 // If c is U+003F (?), then set url's query to the empty string and
626 // state to query state.
627 size_t location = view.find('?');
628 if (location != std::string_view::npos) {
629 view.remove_suffix(view.size() - location);
631 input_position += location + 1;
632 } else {
 // No query: move past input_size so the enclosing while loop ends.
633 input_position = input_size + 1;
634 }
635 url.has_opaque_path = true;
636
637 // This is a really unlikely scenario in real world. We should not seek
638 // to optimize it.
639 if (view.ends_with(' ')) {
640 std::string modified_view =
641 std::string(view.substr(0, view.size() - 1)) + "%20";
642 url.update_base_pathname(unicode::percent_encode(
644 } else {
645 url.update_base_pathname(unicode::percent_encode(
647 }
648 break;
649 }
650 case state::PORT: {
651 ada_log("PORT ", helpers::substring(url_data, input_position));
652 std::string_view port_view = url_data.substr(input_position);
653 input_position += url.parse_port(port_view, true);
654 if (!url.is_valid) {
655 return url;
656 }
658 [[fallthrough]];
659 }
660 case state::PATH_START: {
661 ada_log("PATH_START ", helpers::substring(url_data, input_position));
662
663 // If url is special, then:
664 if (url.is_special()) {
665 // Set state to path state.
667
668 // Optimization: Avoiding going into PATH state improves the
669 // performance of urls ending with /.
670 if (input_position == input_size) {
671 if constexpr (store_values) {
672 url.update_base_pathname("/");
673 if (fragment.has_value()) {
674 url.update_unencoded_base_hash(*fragment);
675 }
676 }
677 return url;
678 }
679 // If c is neither U+002F (/) nor U+005C (\), then decrease pointer
680 // by 1. We know that (input_position == input_size) is impossible
681 // here, because of the previous if-check.
682 if ((url_data[input_position] != '/') &&
683 (url_data[input_position] != '\\')) {
684 break;
685 }
686 }
687 // Otherwise, if state override is not given and c is U+003F (?),
688 // set url's query to the empty string and state to query state.
689 else if ((input_position != input_size) &&
690 (url_data[input_position] == '?')) {
692 }
693 // Otherwise, if c is not the EOF code point:
694 else if (input_position != input_size) {
695 // Set state to path state.
697
698 // If c is not U+002F (/), then decrease pointer by 1.
699 if (url_data[input_position] != '/') {
700 break;
701 }
702 }
703
704 input_position++;
705 break;
706 }
707 case state::PATH: {
708 ada_log("PATH ", helpers::substring(url_data, input_position));
709 std::string_view view = url_data.substr(input_position);
710
711 // Most of the time, we do not need percent encoding.
712 // Furthermore, we can immediately locate the '?'.
713 size_t locofquestionmark = view.find('?');
714 if (locofquestionmark != std::string_view::npos) {
716 view.remove_suffix(view.size() - locofquestionmark);
717 input_position += locofquestionmark + 1;
718 } else {
 // No query: move past input_size so the enclosing while loop ends.
719 input_position = input_size + 1;
720 }
721 if constexpr (store_values) {
722 if constexpr (result_type_is_ada_url) {
723 helpers::parse_prepared_path(view, url.type, url.path);
724 } else {
725 url.consume_prepared_path(view);
726 ADA_ASSERT_TRUE(url.validate());
727 }
728 }
729 break;
730 }
731 case state::FILE_SLASH: {
732 ada_log("FILE_SLASH ", helpers::substring(url_data, input_position));
733
734 // If c is U+002F (/) or U+005C (\), then:
735 if ((input_position != input_size) &&
736 (url_data[input_position] == '/' ||
737 url_data[input_position] == '\\')) {
738 ada_log("FILE_SLASH c is U+002F or U+005C");
739 // Set state to file host state.
741 input_position++;
742 } else {
743 ada_log("FILE_SLASH otherwise");
744 // If base is non-null and base's scheme is "file", then:
745 // Note: it is unsafe to do base_url->scheme unless you know that
746 // base_url_has_value() is true.
747 if (base_url != nullptr && base_url->type == scheme::type::FILE) {
748 // Set url's host to base's host.
749 if constexpr (result_type_is_ada_url) {
750 url.host = base_url->host;
751 } else {
752 url.update_host_to_base_host(base_url->get_host());
753 }
754 // If the code point substring from pointer to the end of input does
755 // not start with a Windows drive letter and base's path[0] is a
756 // normalized Windows drive letter, then append base's path[0] to
757 // url's path.
758 if (!base_url->get_pathname().empty()) {
760 url_data.substr(input_position))) {
761 std::string_view first_base_url_path =
762 base_url->get_pathname().substr(1);
763 size_t loc = first_base_url_path.find('/');
764 if (loc != std::string_view::npos) {
765 helpers::resize(first_base_url_path, loc);
766 }
768 first_base_url_path)) {
769 if constexpr (result_type_is_ada_url) {
770 url.path += '/';
771 url.path += first_base_url_path;
772 } else {
773 url.append_base_pathname(
774 helpers::concat("/", first_base_url_path));
775 }
776 }
777 }
778 }
779 }
780
781 // Set state to path state, and decrease pointer by 1.
783 }
784
785 break;
786 }
787 case state::FILE_HOST: {
788 ada_log("FILE_HOST ", helpers::substring(url_data, input_position));
789 std::string_view view = url_data.substr(input_position);
790
791 size_t location = view.find_first_of("/\\?");
792 std::string_view file_host_buffer(
793 view.data(),
794 (location != std::string_view::npos) ? location : view.size());
795
796 if (checkers::is_windows_drive_letter(file_host_buffer)) {
798 } else if (file_host_buffer.empty()) {
799 // Set url's host to the empty string.
800 if constexpr (result_type_is_ada_url) {
801 url.host = "";
802 } else {
803 url.update_base_hostname("");
804 }
805 // Set state to path start state.
807 } else {
808 size_t consumed_bytes = file_host_buffer.size();
809 input_position += consumed_bytes;
810 // Let host be the result of host parsing buffer with url is not
811 // special.
812 if (!url.parse_host(file_host_buffer)) {
813 return url;
814 }
815
816 if constexpr (result_type_is_ada_url) {
817 // If host is "localhost", then set host to the empty string.
818 if (url.host.has_value() && url.host.value() == "localhost") {
819 url.host = "";
820 }
821 } else {
822 if (url.get_hostname() == "localhost") {
823 url.update_base_hostname("");
824 }
825 }
826
827 // Set buffer to the empty string and state to path start state.
829 }
830
831 break;
832 }
833 case state::FILE: {
834 ada_log("FILE ", helpers::substring(url_data, input_position));
835 std::string_view file_view = url_data.substr(input_position);
836
837 url.set_protocol_as_file();
838 if constexpr (result_type_is_ada_url) {
839 // Set url's host to the empty string.
840 url.host = "";
841 } else {
842 url.update_base_hostname("");
843 }
844 // If c is U+002F (/) or U+005C (\), then:
845 if (input_position != input_size &&
846 (url_data[input_position] == '/' ||
847 url_data[input_position] == '\\')) {
848 ada_log("FILE c is U+002F or U+005C");
849 // Set state to file slash state.
851 }
852 // Otherwise, if base is non-null and base's scheme is "file":
853 else if (base_url != nullptr && base_url->type == scheme::type::FILE) {
854 // Set url's host to base's host, url's path to a clone of base's
855 // path, and url's query to base's query.
856 ada_log("FILE base non-null");
857 if constexpr (result_type_is_ada_url) {
858 url.host = base_url->host;
859 url.path = base_url->path;
860 url.query = base_url->query;
861 } else {
862 url.update_host_to_base_host(base_url->get_hostname());
863 url.update_base_pathname(base_url->get_pathname());
864 if (base_url->has_search()) {
865 // get_search() returns "" for an empty query string (URL ends
866 // with '?'). update_base_search("") would incorrectly clear the
867 // query, so pass "?" to preserve the empty query distinction.
868 auto s = base_url->get_search();
869 url.update_base_search(s.empty() ? std::string_view("?") : s);
870 }
871 }
872 url.has_opaque_path = base_url->has_opaque_path;
873
874 // If c is U+003F (?), then set url's query to the empty string and
875 // state to query state.
876 if (input_position != input_size && url_data[input_position] == '?') {
878 }
879 // Otherwise, if c is not the EOF code point:
880 else if (input_position != input_size) {
881 // Set url's query to null.
882 url.clear_search();
883 // If the code point substring from pointer to the end of input does
884 // not start with a Windows drive letter, then shorten url's path.
885 if (!checkers::is_windows_drive_letter(file_view)) {
886 if constexpr (result_type_is_ada_url) {
887 helpers::shorten_path(url.path, url.type);
888 } else {
889 std::string_view path = url.get_pathname();
890 if (helpers::shorten_path(path, url.type)) {
 // NOTE(review): std::move on a temporary std::string is redundant.
891 url.update_base_pathname(std::move(std::string(path)));
892 }
893 }
894 }
895 // Otherwise:
896 else {
897 // Set url's path to an empty list.
898 url.clear_pathname();
899 url.has_opaque_path = true;
900 }
901
902 // Set state to path state and decrease pointer by 1.
904 break;
905 }
906 }
907 // Otherwise, set state to path state, and decrease pointer by 1.
908 else {
909 ada_log("FILE go to path");
911 break;
912 }
913
914 input_position++;
915 break;
916 }
917 default:
918 unreachable();
919 }
920 }
 // The state machine ran to completion: attach the fragment last (see the
 // note next to prune_hash above).
921 if constexpr (store_values) {
922 if (fragment.has_value()) {
923 url.update_unencoded_base_hash(*fragment);
924 }
925 }
926 return url;
927}
928
// Explicit instantiation of parse_url_impl for ada::url.
929template url parse_url_impl(std::string_view user_input,
930 const url* base_url = nullptr);
// NOTE(review): the first line of the next declaration (listing line 931) is
// not shown here; judging by the parameter type it presumably instantiates
// parse_url_impl for url_aggregator - confirm against the source file.
932 std::string_view user_input, const url_aggregator* base_url = nullptr);
933
// Convenience wrapper: parses `user_input` (optionally relative to
// `base_url`, which may be nullptr) by forwarding to parse_url_impl with
// store_values fixed to true, and returns its result unchanged.
934template <class result_type>
935result_type parse_url(std::string_view user_input,
936 const result_type* base_url) {
937 return parse_url_impl<result_type, true>(user_input, base_url);
938}
939
// Explicit instantiation of parse_url for ada::url.
940template url parse_url<url>(std::string_view user_input,
941 const url* base_url = nullptr);
// NOTE(review): the first line of the next declaration (listing line 942) is
// not shown here; judging by the parameter type it presumably instantiates
// parse_url for url_aggregator - confirm against the source file.
943 std::string_view user_input, const url_aggregator* base_url = nullptr);
944} // namespace ada::parser
Definitions of the character sets used by unicode functions.
Cross-platform compiler macros and common definitions.
#define ADA_ASSERT_TRUE(COND)
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
Internal URL parsing implementation.
Definition parser-inl.h:16
template url parse_url< url >(std::string_view user_input, const url *base_url)
result_type parse_url(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:935
template url_aggregator parse_url< url_aggregator >(std::string_view user_input, const url_aggregator *base_url)
result_type parse_url_impl(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:14
state
States in the URL parsing state machine.
Definition state.h:27
@ SPECIAL_RELATIVE_OR_AUTHORITY
Definition state.h:101
@ FILE_SLASH
Definition state.h:81
@ SCHEME
Definition state.h:41
@ SPECIAL_AUTHORITY_SLASHES
Definition state.h:96
@ FILE_HOST
Definition state.h:76
@ OPAQUE_PATH
Definition state.h:121
@ RELATIVE_SLASH
Definition state.h:66
@ NO_SCHEME
Definition state.h:51
@ PATH_START
Definition state.h:116
@ RELATIVE_SCHEME
Definition state.h:61
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
Definition state.h:91
@ SCHEME_START
Definition state.h:36
@ AUTHORITY
Definition state.h:31
@ PATH_OR_AUTHORITY
Definition state.h:86
ada_warn_unused std::string_view to_string(encoding_type type)
void unreachable()
Memory-efficient URL representation using a single buffer.
ada_really_inline constexpr bool is_special() const noexcept
bool is_valid
Definition url_base.h:56
bool has_opaque_path
Definition url_base.h:62
Represents a parsed URL with individual string components.
Definition url.h:62
std::string get_protocol() const
Definition url.cpp:633
constexpr std::string_view get_pathname() const noexcept
Definition url-inl.h:46
std::string get_hostname() const
Definition url.cpp:655
ada_really_inline std::string get_href() const
Definition url-inl.h:188
Definitions for all unicode specific functions.