% \iffalse meta-comment % %% File: l3regex.dtx % % Copyright (C) 2011-2024 The LaTeX Project % % It may be distributed and/or modified under the conditions of the % LaTeX Project Public License (LPPL), either version 1.3c of this % license or (at your option) any later version. The latest version % of this license is in the file % % https://www.latex-project.org/lppl.txt % % This file is part of the "l3kernel bundle" (The Work in LPPL) % and all files in that bundle must be distributed together. % % ----------------------------------------------------------------------- % % The development version of the bundle can be found at % % https://github.com/latex3/latex3 % % for those people who are interested. % %<*driver> \documentclass[full,kernel]{l3doc} \begin{document} \DocInput{\jobname.dtx} \end{document} % % \fi % % \title{^^A % The \pkg{l3regex} module\\ Regular expressions in \TeX{}^^A % } % % \author{^^A % The \LaTeX{} Project\thanks % {^^A % E-mail: % \href{mailto:latex-team@latex-project.org} % {latex-team@latex-project.org}^^A % }^^A % } % % \date{Released 2024-12-25} % % \maketitle % % \begin{documentation} % \newenvironment{l3regex-syntax} % {\begin{itemize}\def\\{\char`\\}\def\makelabel##1{\hss\llap{\ttfamily##1}}} % {\end{itemize}} % % The \pkg{l3regex} module provides regular expression testing, % extraction of submatches, splitting, and replacement, all acting % on token lists. The syntax of regular expressions is mostly a subset % of the \textsc{pcre} syntax (and very close to \textsc{posix}), % with some additions % due to the fact that \TeX{} manipulates tokens rather than characters. % For performance reasons, only a limited set of features are implemented. % Notably, back-references are not supported. % % Let us give a few examples. After % \begin{verbatim} % \tl_set:Nn \l_my_tl { That~cat. 
} % \regex_replace_once:nnN { at } { is } \l_my_tl % \end{verbatim} % the token list variable \cs[no-index]{l_my_tl} holds the text % \enquote{\texttt{This cat.}}, where the first % occurrence of \enquote{\texttt{at}} was replaced % by \enquote{\texttt{is}}. A more complicated example is % a pattern to emphasize each word and add a comma after it: % \begin{verbatim} % \regex_replace_all:nnN { \w+ } { \c{emph}\cB\{ \0 \cE\} , } \l_my_tl % \end{verbatim} % The |\w| sequence represents any \enquote{word} character, and |+| % indicates that the |\w| sequence should be repeated as many times as % possible (at least once), hence matching a word in the input token % list. In the replacement text, |\0| denotes the full match (here, a % word). The command |\emph| is inserted using |\c{emph}|, and its % argument |\0| is put between braces |\cB\{| and |\cE\}|. % % If a regular expression is to be used several times, % it can be compiled once, and stored in a regex % variable using \cs{regex_set:Nn}. For example, % \begin{verbatim} % \regex_new:N \l_foo_regex % \regex_set:Nn \l_foo_regex { \c{begin} \cB. (\c[^BE].*) \cE. } % \end{verbatim} % stores in \cs[no-index]{l_foo_regex} a regular expression which matches the % starting marker for an environment: \cs[no-index]{begin}, followed by a % begin-group token (|\cB.|), then any number of tokens which are % neither begin-group nor end-group character tokens (|\c[^BE].*|), % ending with an end-group token (|\cE.|). As explained in the next % section, the parentheses \enquote{capture} the result of |\c[^BE].*|, % giving us access to the name of the environment when doing % replacements. % % \section{Syntax of regular expressions} % % \subsection{Regular expression examples} % % We start with a few examples, and encourage the reader to apply % \cs{regex_show:n} to these regular expressions. 
% \begin{itemize} % \item |Cat| matches the word \enquote{Cat} capitalized in this way, % but also matches the beginning of the word \enquote{Cattle}: use % |\bCat\b| to match a complete word only. % \item |[abc]| matches one letter among \enquote{a}, \enquote{b}, % \enquote{c}; the pattern \verb"(a|b|c)" matches the same three % possible letters (but see the discussion of submatches below). % \item |[A-Za-z]*| matches any number (due to the quantifier % \verb"*") of Latin letters (not accented). % \item |\c{[A-Za-z]*}| matches a control sequence made of Latin % letters. % \item |\_[^\_]*\_| matches an underscore, any number of characters % other than underscore, and another underscore; it is equivalent to % |\_.*?\_| where |.| matches arbitrary characters and the % lazy quantifier |*?| means to match as few characters as % possible, thus avoiding matching underscores. % \item |[\+\-]?\d+| matches an explicit integer with at most one % sign. % \item \verb*"[\+\-\ ]*\d+\ *" matches an explicit integer with any % number of $+$ and $-$ signs, with spaces allowed except within the % mantissa, and surrounded by spaces. % \item \verb*"[\+\-\ ]*(\d+|\d*\.\d+)\ *" matches an explicit integer or % decimal number; using \verb*"[.,]" instead of \verb*"\." would allow % the comma as a decimal marker. % \item % \verb*"[\+\-\ ]*(\d+|\d*\.\d+)\ *((?i)pt|in|[cem]m|ex|[bs]p|[dn]d|[pcn]c)\ *" % \allowbreak matches an explicit dimension with any unit that \TeX{} knows, where % \verb*"(?i)" means to treat lowercase and uppercase letters % identically. % \item \verb*"[\+\-\ ]*((?i)nan|inf|(\d+|\d*\.\d+)(\ *e[\+\-\ ]*\d+)?)\ *" % matches an explicit floating point number or the special values % \verb*"nan" and \verb*"inf" (with signs and spaces allowed). % \item \verb*"[\+\-\ ]*(\d+|\cC.)\ *" matches an explicit integer or % control sequence (without checking whether it is an integer % variable). 
% \item |\G.*?\K| at the beginning of a regular expression matches and % discards (due to |\K|) everything between the end of the previous % match (|\G|) and what is matched by the rest of the regular % expression; this is useful in \cs{regex_replace_all:nnN} when the % goal is to extract matches or submatches in a finer way than with % \cs{regex_extract_all:nnN}. % \end{itemize} % While it is impossible for a regular expression to match only integer % expressions, \verb*"[\+\-\(]*\d+\)*([\+\-*/][\+\-\(]*\d+\)*)*" matches among % other things all valid integer expressions (made only with explicit % integers). One should follow it with further testing. % % \subsection{Characters in regular expressions} % % Most characters match exactly themselves, % with an arbitrary category code. Some characters are % special and must be escaped with a backslash (\emph{e.g.}, |\*| % matches a star character). Some escape sequences of % the form backslash--letter also have a special meaning % (for instance |\d| matches any digit). As a rule, % \begin{itemize} % \item every alphanumeric character (\texttt{A}--\texttt{Z}, % \texttt{a}--\texttt{z}, \texttt{0}--\texttt{9}) matches % exactly itself, and should not be escaped, because % |\A|, |\B|, \ldots{} have special meanings; % \item non-alphanumeric printable ascii characters can (and should) % always be escaped: many of them have special meanings (\emph{e.g.}, % use |\(|, |\)|, |\?|, |\.|, |\^|); % \item spaces should always be escaped (even in character % classes); % \item any other character may be escaped or not, without any % effect: both versions match exactly that character. % \end{itemize} % Note that these rules play nicely with the fact that many % non-alphanumeric characters are difficult to input into \TeX{} % under normal category codes. For instance, |\\abc\%| % matches the characters |\abc%| (with arbitrary category codes), % but does not match the control sequence |\abc| followed by a % percent character. 
Matching control sequences can be done % using the |\c|\Arg{regex} syntax (see below). % % Any special character which appears at a place where its special % behaviour cannot apply matches itself instead (for instance, a % quantifier appearing at the beginning of a string), after raising a % warning. % % Characters. % \begin{l3regex-syntax} % \item[\\x\{hh\ldots{}\}] Character with hex code \texttt{hh\ldots{}}. % \item[\\xhh] Character with hex code \texttt{hh}. % \item[\\a] Alarm (hex 07). % \item[\\e] Escape (hex 1B). % \item[\\f] Form-feed (hex 0C). % \item[\\n] New line (hex 0A). % \item[\\r] Carriage return (hex 0D). % \item[\\t] Horizontal tab (hex 09). % \end{l3regex-syntax} % % \subsection{Character classes} % % Character properties. % \begin{l3regex-syntax} % \item[.] A single period matches any token. % \item[\\d] Any decimal digit. % \item[\\h] Any horizontal space character, % equivalent to |[\ \^^I]|: space and tab. % \item[\\s] Any space character, % equivalent to |[\ \^^I\^^J\^^L\^^M]|. % \item[\\v] Any vertical space character, % equivalent to |[\^^J\^^K\^^L\^^M]|. Note that |\^^K| is a vertical space, % but not a space, for compatibility with Perl. % \item[\\w] Any word character, \emph{i.e.}, % alphanumerics and underscore, equivalent to the explicit % class |[A-Za-z0-9\_]|. % \item[\\D] Any token not matched by |\d|. % \item[\\H] Any token not matched by |\h|. % \item[\\N] Any token other than the |\n| character (hex 0A). % \item[\\S] Any token not matched by |\s|. % \item[\\V] Any token not matched by |\v|. % \item[\\W] Any token not matched by |\w|. % \end{l3regex-syntax} % Of those, |.|, |\D|, |\H|, |\N|, |\S|, |\V|, and |\W| match arbitrary % control sequences. % % Character classes match exactly one token in the subject. % \begin{l3regex-syntax} % \item[{[\ldots{}]}] Positive character class. % Matches any of the specified tokens. % \item[{[\char`\^\ldots{}]}] Negative character class. % Matches any token other than the specified characters.
% \item[{[x-y]}] Within a character class, this denotes a range (can be % used with escaped characters). % \item[{[:\meta{name}:]}] Within a character class (one more set of % brackets), this denotes the \textsc{posix} character class % \meta{name}, which can be \texttt{alnum}, \texttt{alpha}, % \texttt{ascii}, \texttt{blank}, \texttt{cntrl}, \texttt{digit}, % \texttt{graph}, \texttt{lower}, \texttt{print}, \texttt{punct}, % \texttt{space}, \texttt{upper}, \texttt{word}, or \texttt{xdigit}. % \item[{[:\char`\^\meta{name}:]}] Negative \textsc{posix} character class. % \end{l3regex-syntax} % For instance, |[a-oq-z\cC.]| matches any lowercase Latin letter % except |p|, as well as control sequences (see below for a description % of |\c|). % % In character classes, only |[|, |^|, |-|, |]|, |\| and spaces are % special, and should be escaped. Other non-alphanumeric characters can % still be escaped without harm. Any escape sequence which matches a % single character (|\d|, |\D|, \emph{etc.}) is supported in character % classes. If the first character is |^|, then % the meaning of the character class is inverted; |^| appearing anywhere % else in the range is not special. If the first character (possibly % following a leading |^|) is |]| then it does not need to be escaped % since ending the range there would make it empty. % Ranges of characters % can be expressed using |-|, for instance, |[\D 0-5]| and |[^6-9]| are % equivalent. % % \subsection{Structure: alternatives, groups, repetitions} % % Quantifiers (repetition). % \begin{l3regex-syntax} % \item[?] $0$ or $1$, greedy. % \item[??] $0$ or $1$, lazy. % \item[*] $0$ or more, greedy. % \item[*?] $0$ or more, lazy. % \item[+] $1$ or more, greedy. % \item[+?] $1$ or more, lazy. % \item[\{$n$\}] Exactly $n$. % \item[\{$n,$\}] $n$ or more, greedy. % \item[\{$n,$\}?] $n$ or more, lazy. % \item[\{$n,m$\}] At least $n$, no more than $m$, greedy. % \item[\{$n,m$\}?] At least $n$, no more than $m$, lazy.
% \end{l3regex-syntax} % For greedy quantifiers the regex code will first investigate matches % that involve as many repetitions as possible, while for lazy % quantifiers it investigates matches with as few repetitions as % possible first. % % Alternation and capturing groups. % \begin{l3regex-syntax} % \item[A\char`|B\char`|C] Either one of \texttt{A}, \texttt{B}, % or \texttt{C}, investigating \texttt{A} first. % \item[(\ldots{})] Capturing group. % \item[(?:\ldots{})] Non-capturing group. % \item[(?\char`|\ldots{})] Non-capturing group which resets % the group number for capturing groups in each alternative. % The following group is numbered with the first unused % group number. % \end{l3regex-syntax} % % Capturing groups are a means of extracting information about the % match. Parenthesized groups are labelled in the order of their % opening parenthesis, starting at $1$. The contents of those groups % corresponding to the \enquote{best} match (leftmost longest) % can be extracted and stored in a sequence of token lists using for % instance \cs{regex_extract_once:nnNTF}. % % The |\K| escape sequence resets the beginning of the match to the % current position in the token list. This only affects what is reported % as the full match. For instance, % \begin{verbatim} % \regex_extract_all:nnN { a \K . } { a123aaxyz } \l_foo_seq % \end{verbatim} % results in \cs[no-index]{l_foo_seq} containing the items |{1}| and |{a}|: the % true matches are |{a1}| and |{aa}|, but they are trimmed by the use of % |\K|. The |\K| command does not affect capturing groups: for instance, % \begin{verbatim} % \regex_extract_once:nnN { (. \K c)+ \d } { acbc3 } \l_foo_seq % \end{verbatim} % results in \cs[no-index]{l_foo_seq} containing the items |{c3}| and |{bc}|: the % true match is |{acbc3}|, with first submatch |{bc}|, but |\K| resets % the beginning of the match to the last position where it appears. 
% % \subsection{Matching exact tokens} % % The |\c| escape sequence allows to test the category code of tokens, % and match control sequences. Each character category is represented % by a single uppercase letter: % \begin{itemize} % \item |C| for control sequences; % \item |B| for begin-group tokens; % \item |E| for end-group tokens; % \item |M| for math shift; % \item |T| for alignment tab tokens; % \item |P| for macro parameter tokens; % \item |U| for superscript tokens (up); % \item |D| for subscript tokens (down); % \item |S| for spaces; % \item |L| for letters; % \item |O| for others; and % \item |A| for active characters. % \end{itemize} % The |\c| escape sequence is used as follows. % \begin{l3regex-syntax} % \item[\\c\Arg{regex}] A control sequence whose csname matches the % \meta{regex}, anchored at the beginning and end, so that |\c{begin}| % matches exactly \cs[no-index]{begin}, and nothing else. % \item[\\cX] Applies to the next object, which can be a character, % escape character sequence such as |\x{0A}|, character class, or % group, and forces this object to only match tokens with category % |X| (any of |CBEMTPUDSLOA|). For instance, |\cL[A-Z\d]| matches % uppercase letters and digits of category code letter, |\cC.| % matches any control sequence, and |\cO(abc)| matches |abc| where % each character has category other.\footnote{This last example also % captures \enquote{\texttt{abc}} as a regex group; to avoid this % use a non-capturing group \texttt{\textbackslash cO(?:abc)}.} % \item[{\\c[XYZ]}] Applies to the next object, and forces it to only % match tokens with category |X|, |Y|, or |Z| (each being any of % |CBEMTPUDSLOA|). For instance, |\c[LSO](..)| matches two tokens of % category letter, space, or other. % \item[{\\c[\char`\^XYZ]}] Applies to the next object and prevents it % from matching any token with category |X|, |Y|, or |Z| (each being % any of |CBEMTPUDSLOA|).
For instance, |\c[^O]\d| matches digits % which have any category different from other. % \end{l3regex-syntax} % The category code tests can be used inside classes; for instance, % |[\cO\d \c[LO][A-F]]| matches what \TeX{} considers as hexadecimal % digits, namely digits with category other, or uppercase letters from % |A| to |F| with category either letter or other. Within a group % affected by a category code test, the outer test can be overridden by % a nested test: for instance, |\cL(ab\cO\*cd)| matches |ab*cd| where % all characters are of category letter, except |*| which has category % other. % % The |\u| escape sequence allows to insert the contents of a token list % directly into a regular expression or a replacement, avoiding the need % to escape special characters. Namely, |\u|\Arg{var~name} matches % the exact contents (both character codes and category codes) of the % variable \cs[no-index]{\meta{var~name}}, % which are obtained by applying \cs{exp_not:v} \Arg{var~name} at the % time the regular expression is compiled. Within a |\c{...}| % control sequence matching, the |\u| escape sequence only expands its % argument once, in effect performing \cs{tl_to_str:v}. % Quantifiers are supported. % % The |\ur| escape sequence allows to insert the contents of a |regex| % variable into a larger regular expression. For instance, % |A\ur{l_tmpa_regex}D| matches the tokens |A| and |D| separated by % something that matches the regular expression % \cs[no-index]{l_tmpa_regex}. This behaves as if a non-capturing group % were surrounding \cs[no-index]{l_tmpa_regex}, and any group contained % in \cs[no-index]{l_tmpa_regex} is converted to a non-capturing group. % Quantifiers are supported. % % For instance, if \cs[no-index]{l_tmpa_regex} has value \verb"B|C", % then |A\ur{l_tmpa_regex}D| is equivalent to \verb"A(?:B|C)D" (matching % |ABD| or |ACD|) and not to \verb"AB|CD" (matching |AB| or |CD|). 
To % get the latter effect, it is simplest to use \TeX{}'s expansion % machinery directly: if \cs[no-index]{l_mymodule_BC_tl} contains % \verb"B|C" then the following two lines show the same result: % \begin{quote} % \cs{regex_show:n} |{ A \u{l_mymodule_BC_tl} D }| \\ % \cs{regex_show:n} \verb"{ A B | C D }" % \end{quote} % % \subsection{Miscellaneous} % % Anchors and simple assertions. % \begin{l3regex-syntax} % \item[\\b] Word boundary: either the previous token is matched by % |\w| and the next by |\W|, or the opposite. For this purpose, % the ends of the token list are considered as |\W|. % \item[\\B] Not a word boundary: between two |\w| tokens % or two |\W| tokens (including the boundary). % \item[\char`^ \textrm{or} \\A] % Start of the subject token list. % \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $ % End of the subject token list. % \item[\\G] Start of the current match. This is only different from |^| % in the case of multiple matches: for instance % |\regex_count:nnN { \G a } { aaba } \l_tmpa_int| yields $2$, but % replacing |\G| by |^| would result in \cs{l_tmpa_int} holding the % value $1$. % \end{l3regex-syntax} % % The option |(?i)| makes the match case insensitive (treating % \texttt{A}--\texttt{Z} and \texttt{a}--\texttt{z} as equivalent, with % no support yet for Unicode case changing). This % applies until the end of the group in which it appears, and % can be reverted using |(?-i)|. For instance, in % \verb"(?i)(a(?-i)b|c)d", the letters |a| and |d| are affected by the % |i| option. Characters within ranges and classes are affected % individually: |(?i)[\?-B]| is equivalent to |[\?@ABab]| % (and differs from the much larger class |[\?-b]|), and % |(?i)[^aeiou]| matches any character which is not a vowel. % The |i|~option has no effect on |\c{...}|, on |\u{...}|, on character % properties, or on character classes, for instance it has no effect at % all in |(?i)\u{l_foo_tl}\d\d[[:lower:]]|. 
% % \section{Syntax of the replacement text} % % Most of the features described in regular expressions do not make % sense within the replacement text. Backslash introduces various % special constructions, described further below: % \begin{itemize} % \item |\0| is the whole match; % \item |\1| is the submatch that was matched by the first (capturing) % group |(...)|; similarly for |\2|, \ldots{}, |\9| and % |\g{|\meta{number}|}|; % \item \verb*|\ | inserts a space (spaces are ignored when not % escaped); % \item |\a|, |\e|, |\f|, |\n|, |\r|, |\t|, |\xhh|, |\x{hhh}| % correspond to single characters as in regular expressions; % \item |\c|\Arg{cs~name} inserts a control sequence; % \item |\c|\meta{category}\meta{character} (see below); % \item |\u|\Arg{tl~var~name} inserts the contents of the % \meta{tl~var} (see below). % \end{itemize} % Characters other than backslash and space are simply inserted in the % result (but since the replacement text is first converted to a string, % one should also escape characters that are special for \TeX{}, for % instance use~|\#|). Non-alphanumeric characters can always be safely % escaped with a backslash. % % For instance, % \begin{verbatim} % \tl_set:Nn \l_my_tl { Hello,~world! } % \regex_replace_all:nnN { ([er]?l|o) . } { (\0--\1) } \l_my_tl % \end{verbatim} % results in \cs[no-index]{l_my_tl} holding |H(ell--el)(o,--o) w(or--o)(ld--l)!| % % The submatches are numbered according to the order in which the % opening parenthesis of capturing groups appear in the regular % expression to match. The $n$-th submatch is empty if there are fewer % than $n$ capturing groups or for capturing groups that appear in % alternatives that were not used for the match. In case a capturing % group matches several times during a match (due to quantifiers) only % the last match is used in the replacement text. Submatches always keep % the same category codes as in the original token list. 
% % By default, the category code of characters inserted by the % replacement is determined by the prevailing category code regime at % the time where the replacement is made, with two exceptions: % \begin{itemize} % \item space characters (with character code $32$) inserted with % \verb*|\ | or |\x20| or |\x{20}| have category code~$10$ regardless % of the prevailing category code regime; % \item if the category code would be $0$~(escape), $5$~(newline), % $9$~(ignore), $14$~(comment) or $15$~(invalid), it is replaced by % $12$~(other) instead. % \end{itemize} % The escape sequence |\c| allows to insert characters % with arbitrary category codes, as well as control sequences. % \begin{l3regex-syntax} % \item[\\cX(\ldots{})] Produces the characters \enquote{\ldots{}} with % category~|X|, which must be one of |CBEMTPUDSLOA| as in regular % expressions. Parentheses are optional for a single character (which % can be an escape sequence). When nested, the innermost category % code applies, for instance |\cL(Hello\cS\ world)!| gives this text % with standard category codes. % \item[\\c\Arg{text}] Produces the control sequence with csname % \meta{text}. The \meta{text} may contain references to the % submatches |\0|, |\1|, and so on, as in the example for |\u| below. % \end{l3regex-syntax} % % The escape sequence |\u|\Arg{var~name} allows to insert the % contents of the variable with name \meta{var~name} directly into % the replacement, giving an easier control of category codes. When % nested in |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the % |\u| and |\c|~escape sequences perform \cs{tl_to_str:v}, namely % extract the value of the control sequence and turn it into a string. % Matches can also be used within the arguments of |\c| and |\u|.
For % instance, % \begin{verbatim} % \tl_set:Nn \l_my_one_tl { first } % \tl_set:Nn \l_my_two_tl { \emph{second} } % \tl_set:Nn \l_my_tl { one , two , one , one } % \regex_replace_all:nnN { [^,]+ } { \u{l_my_\0_tl} } \l_my_tl % \end{verbatim} % results in \cs[no-index]{l_my_tl} holding |first,\emph{second},first,first|. % % Regex replacement is also a convenient way to produce token lists % with arbitrary category codes. For instance % \begin{verbatim} % \tl_clear:N \l_tmpa_tl % \regex_replace_all:nnN { } { \cU\% \cA\~ } \l_tmpa_tl % \end{verbatim} % results in \cs[no-index]{l_tmpa_tl} containing the percent character % with category code~$7$ (superscript) and an active tilde character. % % \section{Pre-compiling regular expressions} % % If a regular expression is to be used several times, % it is better to compile it once rather than doing it % each time the regular expression is used. The compiled % regular expression is stored in a variable. All % of the \pkg{l3regex} module's functions can be given their % regular expression argument either as an explicit string % or as a compiled regular expression. % % \begin{function}[added = 2017-05-26]{\regex_new:N} % \begin{syntax} % \cs{regex_new:N} \meta{regex~var} % \end{syntax} % Creates a new \meta{regex~var} or raises an error if the % name is already taken. The declaration is global. The % \meta{regex~var} is initially such that it never matches. % \end{function} % % \begin{function}[added = 2017-05-26]{\regex_set:Nn, \regex_gset:Nn} % \begin{syntax} % \cs{regex_set:Nn} \meta{regex~var} \Arg{regex} % \end{syntax} % Stores a compiled version of the \meta{regex} in the % \meta{regex~var}. The assignment is local for \cs{regex_set:Nn} and % global for \cs{regex_gset:Nn}. For instance, this function can be % used as % \begin{verbatim} % \regex_new:N \l_my_regex % \regex_set:Nn \l_my_regex { my\ (simple\ )? 
reg(ex|ular\ expression) } % \end{verbatim} % \end{function} % % \begin{function}[added = 2017-05-26]{\regex_const:Nn} % \begin{syntax} % \cs{regex_const:Nn} \meta{regex~var} \Arg{regex} % \end{syntax} % Creates a new constant \meta{regex~var} or raises an error if the name % is already taken. The value of the \meta{regex~var} is set % globally to the compiled version of the \meta{regex}. % \end{function} % % \begin{function}[added = 2021-04-26, updated = 2021-04-29] % {\regex_show:N, \regex_show:n, \regex_log:N, \regex_log:n} % \begin{syntax} % \cs{regex_show:n} \Arg{regex} % \cs{regex_log:n} \Arg{regex} % \end{syntax} % Displays in the terminal or writes in the log file (respectively) % how \pkg{l3regex} interprets the \meta{regex}. For instance, % \cs{regex_show:n} \verb+{\A X|Y}+ shows % \begin{verbatim} % +-branch % anchor at start (\A) % char code 88 (X) % +-branch % char code 89 (Y) % \end{verbatim} % indicating that the anchor |\A| only applies to the first branch: % the second branch is not anchored to the beginning of the match. % \end{function} % % \section{Matching} % % All regular expression functions are available in both |:n| and |:N| % variants. The former require a \enquote{standard} regular expression, % while the latter require a compiled expression as generated by % \cs{regex_set:Nn}. % % \begin{function}[TF, added = 2017-05-26] % { % \regex_match:nn, \regex_match:nV, % \regex_match:Nn, \regex_match:NV % } % \begin{syntax} % \cs{regex_match:nnTF} \Arg{regex} \Arg{token list} \Arg{true code} \Arg{false code} % \end{syntax} % Tests whether the \meta{regex} matches any part % of the \meta{token list}. For instance, % \begin{verbatim} % \regex_match:nnTF { b [cde]* } { abecdcx } { TRUE } { FALSE } % \regex_match:nnTF { [b-dq-w] } { example } { TRUE } { FALSE } % \end{verbatim} % leaves \texttt{TRUE} then \texttt{FALSE} in the input stream.
% \end{function} % % \begin{function}[added = 2017-05-26] % { % \regex_count:nnN, \regex_count:nVN, % \regex_count:NnN, \regex_count:NVN % } % \begin{syntax} % \cs{regex_count:nnN} \Arg{regex} \Arg{token list} \meta{integer} % \end{syntax} % Sets \meta{integer} within the current \TeX{} group level % equal to the number of times % \meta{regex} appears in \meta{token list}. % The search starts by finding the left-most longest match, % respecting greedy and lazy (non-greedy) operators. Then the search % starts again from the character following the last character % of the previous match, until reaching the end of the token list. % Infinite loops are prevented in the case where the regular expression % can match an empty token list: then we count one match between each % pair of characters. % For instance, % \begin{verbatim} % \int_new:N \l_foo_int % \regex_count:nnN { (b+|c) } { abbababcbb } \l_foo_int % \end{verbatim} % results in \cs[no-index]{l_foo_int} taking the value $5$. % \end{function} % % \begin{function}[noTF, added = 2022-01-10]{\regex_match_case:nn} % \begin{syntax} % \cs{regex_match_case:nnTF} % ~~|{| \\ % ~~~~\Arg{regex_1} \Arg{code case_1} \\ % ~~~~\Arg{regex_2} \Arg{code case_2} \\ % ~~~~\ldots \\ % ~~~~\Arg{regex_n} \Arg{code case_n} \\ % ~~|}| \Arg{token list} % ~~\Arg{true code} \Arg{false code} % \end{syntax} % Determines which of the \meta{regular expressions} matches at the % earliest point in the \meta{token list}, and leaves the % corresponding \meta{code} followed by the \meta{true code} in the % input stream. If several \meta{regex} match starting at the same % point, then the first one in the list is selected and the others are % discarded. If none of the \meta{regex} match, the \meta{false code} % is left in the input stream. Each \meta{regex} can either be given % as a regex variable or as an explicit regular expression. % % In detail, for each starting position in the \meta{token list}, each % of the \meta{regex} is searched in turn. 
If one of them matches % then the corresponding \meta{code} is used and everything else is % discarded, while if none of the \meta{regex} match at a given % position then the next starting position is attempted. If none of % the \meta{regex} match anywhere in the \meta{token list} then % nothing is left in the input stream. Note that this differs from % nested \cs{regex_match:nnTF} statements since all \meta{regex} are % attempted at each position rather than attempting to match % \meta{regex_1} at every position before moving on to \meta{regex_2}. % \end{function} % % \section{Submatch extraction} % % \begin{function}[noTF, added = 2017-05-26] % { % \regex_extract_once:nnN, \regex_extract_once:nVN, % \regex_extract_once:NnN, \regex_extract_once:NVN % } % \begin{syntax} % \cs{regex_extract_once:nnN} \Arg{regex} \Arg{token list} \meta{seq~var} % \cs{regex_extract_once:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code} % \end{syntax} % Finds the first match of the \meta{regex} in the % \meta{token list}. If it exists, the match is stored as the first % item of the \meta{seq~var}, and further items are the contents of % capturing groups, in the order of their opening parenthesis. The % \meta{seq~var} is assigned locally. If there is no match, the % \meta{seq~var} is cleared. The testing versions insert the % \meta{true code} into the input stream if a match was found, and the % \meta{false code} otherwise. % % For instance, assume that you type % \begin{verbatim} % \regex_extract_once:nnNTF { \A(La)?TeX(!*)\Z } { LaTeX!!! } \l_foo_seq % { true } { false } % \end{verbatim} % Then the regular expression (anchored at the start with |\A| and % at the end with |\Z|) must match the whole token list. The first % capturing group, |(La)?|, matches |La|, and the second capturing % group, |(!*)|, matches |!!!|. 
Thus, \cs[no-index]{l_foo_seq} contains as a result % the items |{LaTeX!!!}|, |{La}|, and |{!!!}|, and the \texttt{true} % branch is left in the input stream. % Note that the $n$-th item of \cs[no-index]{l_foo_seq}, as obtained using % \cs{seq_item:Nn}, corresponds to the submatch numbered $(n-1)$ in % functions such as \cs{regex_replace_once:nnN}. % \end{function} % % \begin{function}[noTF, added = 2017-05-26] % { % \regex_extract_all:nnN, \regex_extract_all:nVN, % \regex_extract_all:NnN, \regex_extract_all:NVN % } % \begin{syntax} % \cs{regex_extract_all:nnN} \Arg{regex} \Arg{token list} \meta{seq~var} % \cs{regex_extract_all:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code} % \end{syntax} % Finds all matches of the \meta{regex} % in the \meta{token list}, and stores all the submatch information % in a single sequence (concatenating the results of % multiple \cs{regex_extract_once:nnN} calls). % The \meta{seq~var} is assigned locally. If there is no match, % the \meta{seq~var} is cleared. % The testing versions insert the \meta{true code} into the input % stream if a match was found, and the \meta{false code} otherwise. % For instance, assume that you type % \begin{verbatim} % \regex_extract_all:nnNTF { \w+ } { Hello,~world! } \l_foo_seq % { true } { false } % \end{verbatim} % Then the regular expression matches twice, the resulting % sequence contains the two items |{Hello}| and |{world}|, % and the \texttt{true} branch is left in the input stream. % \end{function} % % \begin{function}[noTF, added = 2017-05-26] % { % \regex_split:nnN, \regex_split:nVN, % \regex_split:NnN, \regex_split:NVN % } % \begin{syntax} % \cs{regex_split:nnN} \Arg{regex} \Arg{token list} \meta{seq~var} % \cs{regex_split:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code} % \end{syntax} % Splits the \meta{token list} into a sequence of parts, delimited by % matches of the \meta{regex}.
If the \meta{regex} % has capturing groups, then the token lists that they match are stored as % items of the sequence as well. The assignment to \meta{seq~var} is local. % If no match is found the resulting \meta{seq~var} has the % \meta{token list} as its sole item. If the \meta{regex} % matches the empty token list, then the \meta{token list} is split % into single tokens. % The testing versions insert the \meta{true code} into the input % stream if a match was found, and the \meta{false code} otherwise. % For example, after % \begin{verbatim} % \seq_new:N \l_path_seq % \regex_split:nnNTF { / } { the/path/for/this/file.tex } \l_path_seq % { true } { false } % \end{verbatim} % the sequence |\l_path_seq| contains the items |{the}|, |{path}|, % |{for}|, |{this}|, and |{file.tex}|, and the \texttt{true} branch % is left in the input stream. % \end{function} % % \section{Replacement} % % \begin{function}[noTF, added = 2017-05-26] % { % \regex_replace_once:nnN,\regex_replace_once:nVN, % \regex_replace_once:NnN,\regex_replace_once:NVN % } % \begin{syntax} % \cs{regex_replace_once:nnN} \Arg{regex} \Arg{replacement} \meta{tl~var} % \cs{regex_replace_once:nnNTF} \Arg{regex} \Arg{replacement} \meta{tl~var} \Arg{true code} \Arg{false code} % \end{syntax} % Searches for the \meta{regex} in the contents of the % \meta{tl~var} and replaces the first match with the % \meta{replacement}. In the \meta{replacement}, % |\0| represents the full match, |\1| represents the contents of the % first capturing group, |\2| of the second, \emph{etc.} % The result is assigned locally to \meta{tl~var}.
% \end{function} % % \begin{function}[noTF, added = 2017-05-26] % { % \regex_replace_all:nnN,\regex_replace_all:nVN, % \regex_replace_all:NnN,\regex_replace_all:NVN % } % \begin{syntax} % \cs{regex_replace_all:nnN} \Arg{regex} \Arg{replacement} \meta{tl~var} % \cs{regex_replace_all:nnNTF} \Arg{regex} \Arg{replacement} \meta{tl~var} \Arg{true code} \Arg{false code} % \end{syntax} % Replaces all occurrences of the \meta{regex} in the % contents of the \meta{tl~var} % by the \meta{replacement}, where |\0| represents % the full match, |\1| represents the contents of the first capturing % group, |\2| of the second, \emph{etc.} Every match is treated % independently, and matches cannot overlap. The result is assigned % locally to \meta{tl~var}. % \end{function} % % \begin{function}[noTF, added = 2022-01-10]{\regex_replace_case_once:nN} % \begin{syntax} % \cs{regex_replace_case_once:nNTF} % ~~|{| \\ % ~~~~\Arg{regex_1} \Arg{replacement_1} \\ % ~~~~\Arg{regex_2} \Arg{replacement_2} \\ % ~~~~\ldots \\ % ~~~~\Arg{regex_n} \Arg{replacement_n} \\ % ~~|}| \meta{tl~var} % ~~\Arg{true code} \Arg{false code} % \end{syntax} % Replaces the earliest match of the regular expression % "(?|"\meta{regex_1}"|"\dots"|"\meta{regex_n}")" in the % \meta{tl~var} by the \meta{replacement} corresponding to which % \meta{regex_i} matched, then leaves the \meta{true code} in the % input stream. If none of the \meta{regex} match, then the % \meta{tl~var} is not modified, and the \meta{false code} is left in % the input stream. Each \meta{regex} can either be given as a regex % variable or as an explicit regular expression. % % In detail, for each starting position in the contents of the \meta{tl~var}, each % of the \meta{regex} is searched in turn. If one of them matches % then it is replaced by the corresponding \meta{replacement} as % described for \cs{regex_replace_once:nnN}.
This is equivalent to % checking with \cs{regex_match_case:nn} which \meta{regex} matches, % then performing the replacement with \cs{regex_replace_once:nnN}. % \end{function} % % \begin{function}[noTF, added = 2022-01-10]{\regex_replace_case_all:nN} % \begin{syntax} % \cs{regex_replace_case_all:nNTF} % ~~|{| \\ % ~~~~\Arg{regex_1} \Arg{replacement_1} \\ % ~~~~\Arg{regex_2} \Arg{replacement_2} \\ % ~~~~\ldots \\ % ~~~~\Arg{regex_n} \Arg{replacement_n} \\ % ~~|}| \meta{tl~var} % ~~\Arg{true code} \Arg{false code} % \end{syntax} % Replaces all occurrences of all \meta{regex} in the contents of the % \meta{tl~var} by the corresponding \meta{replacement}. Every match is % treated independently, and matches cannot overlap. The result is % assigned locally to \meta{tl~var}, and the \meta{true code} or % \meta{false code} is left in the input stream depending on whether % any replacement was made or not. % % In detail, for each starting position in the contents of the \meta{tl~var}, each % of the \meta{regex} is searched in turn. If one of them matches % then it is replaced by the corresponding \meta{replacement}, and the % search resumes at the position that follows this match (and % replacement). For instance % \begin{verbatim} % \tl_set:Nn \l_tmpa_tl { Hello,~world! } % \regex_replace_case_all:nN % { % { [A-Za-z]+ } { ``\0'' } % { \b } { --- } % { . } { [\0] } % } \l_tmpa_tl % \end{verbatim} % results in \cs{l_tmpa_tl} having the contents % \verb*|``Hello''---[,][ ]``world''---[!]|. Note in particular that % the word-boundary assertion |\b| did not match at the start of words % because the case |[A-Za-z]+| matched at these positions. To change % this, one could simply swap the order of the two cases in the % argument of \cs{regex_replace_case_all:nN}. % \end{function} % % \section{Scratch regular expressions} % % \begin{variable}[added = 2017-12-11]{\l_tmpa_regex, \l_tmpb_regex} % Scratch regex for local assignment.
These are never used by % the kernel code, and so are safe for use with any \LaTeX3-defined % function. However, they may be overwritten by other non-kernel % code and so should only be used for short-term storage. % \end{variable} % % \begin{variable}[added = 2017-12-11]{\g_tmpa_regex, \g_tmpb_regex} % Scratch regex for global assignment. These are never used by % the kernel code, and so are safe for use with any \LaTeX3-defined % function. However, they may be overwritten by other non-kernel % code and so should only be used for short-term storage. % \end{variable} % % \section{Bugs, misfeatures, future work, and other possibilities} % % The following need to be done now. % \begin{itemize} % \item Rewrite the documentation in a more ordered way, perhaps add a % \textsc{bnf}? % \end{itemize} % % Additional error-checking to come. % \begin{itemize} % \item Clean up the use of messages. % \item Cleaner error reporting in the replacement phase. % \item Add tracing information. % \item Detect attempts to use back-references and other % non-implemented syntax. % \item Test for the maximum register \cs[no-index]{c_max_register_int}. % \item Find out whether the fact that |\W| and friends match the % end-marker leads to bugs. Possibly update \cs[no-index]{__regex_item_reverse:n}. % \item The empty cs should be matched by |\c{}|, not by % |\c{csname.?endcsname\s?}|. % \end{itemize} % % Code improvements to come. % \begin{itemize} % \item Shift arrays so that the useful information starts at % position~$1$. % \item Only build |\c{...}| once. % \item Use arrays for the left and right state stacks when % compiling a regex. % \item Should \cs[no-index]{__regex_action_free_group:n} only be used for greedy % |{n,}| quantifier? (I think not.) % \item Quantifiers for |\u| and assertions. % \item When matching, keep track of an explicit stack of % \texttt{curr_state} and \texttt{curr_submatches}. 
% \item If possible, when a state is reused by the same thread, kill % other subthreads. % \item Use an array rather than \cs[no-index]{g__regex_balance_tl} % to build the function \cs[no-index]{__regex_replacement_balance_one_match:n}. % \item Reduce the number of epsilon-transitions in alternatives. % \item Optimize simple strings: use less states (|abcade| should give % two states, for |abc| and |ade|). [Does that really make sense?] % \item Optimize groups with no alternative. % \item Optimize states with a single \cs[no-index]{__regex_action_free:n}. % \item Optimize the use of \cs[no-index]{__regex_action_success:} by inserting it % in state $2$ directly instead of having an extra transition. % \item Optimize the use of \cs[no-index]{int_step_...} functions. % \item Groups don't capture within regexes for csnames; optimize and % document. % \item Better \enquote{show} for anchors, properties, and catcode tests. % \item Does |\K| really need a new state for itself? % \item When compiling, use a boolean \texttt{in_cs} and less magic % numbers. % \end{itemize} % % The following features are likely to be implemented at some point % in the future. % \begin{itemize} % \item General look-ahead/behind assertions. % \item Regex matching on external files. % \item Conditional subpatterns with look ahead/behind: \enquote{if % what follows is [\ldots{}], then [\ldots{}]}. % \item |(*..)| and |(?..)| sequences to set some options. % \item UTF-8 mode for \pdfTeX{}. % \item Newline conventions are not done. % In particular, we should have an option for |.| not to match newlines. % Also, |\A| should differ from |^|, and |\Z|, |\z| and |$| should % differ. % \item Unicode properties: |\p{..}| and |\P{..}|; % |\X| which should match any \enquote{extended} Unicode sequence. % This requires to manipulate a lot of data, probably using tree-boxes. % \end{itemize} % % The following features of \textsc{pcre} or Perl may or may not be % implemented. 
% \begin{itemize} % \item Callout with |(?C...)| or other syntax: some internal code % changes make that possible, and it can be useful for instance in % the replacement code to stop a regex replacement when some marker % has been found; this raises the question of a potential % |\regex_break:| and then of playing well with \cs{tl_map_break:} % called from within the code in a regex. It also raises the % question of nested calls to the regex machinery, which is a % problem since \tn{fontdimen} are global. % \item Conditional subpatterns (other than with a look-ahead or % look-behind condition): this is non-regular, isn't it? % \item Named subpatterns: \TeX{} programmers have lived so far % without any need for named macro parameters. % \end{itemize} % % The following features of \textsc{pcre} or Perl will definitely not be % implemented. % \begin{itemize} % \item Back-references: non-regular feature, this requires % backtracking, which is prohibitively slow. % \item Recursion: this is a non-regular feature. % \item Atomic grouping, possessive quantifiers: those tools, mostly % meant to fix catastrophic backtracking, are unnecessary in a % non-backtracking algorithm, and difficult to implement. % \item Subroutine calls: this syntactic sugar is difficult to include % in a non-backtracking algorithm, in particular because the % corresponding group should be treated as atomic. % \item Backtracking control verbs: intrinsically tied to % backtracking. % \item |\ddd|, matching the character with octal code \texttt{ddd}: % we already have |\x{...}| and the syntax is confusingly close to % what we could have used for backreferences (|\1|, |\2|, \ldots{}), % making it harder to produce useful error messages. % \item |\cx|, similar to \TeX{}'s own |\^^x|. % \item Comments: \TeX{} already has its own system for comments. % \item |\Q...\E| escaping: this would require reading the argument % verbatim, which is not in the scope of this module.
% \item |\C| single byte in UTF-8 mode: \XeTeX{} and \LuaTeX{} serve % us characters directly, and splitting those into bytes is tricky, % encoding dependent, and most likely not useful anyways. % \end{itemize} % % \end{documentation} % % \begin{implementation} % % \section{\pkg{l3regex} implementation} % % \begin{macrocode} %<*package> % \end{macrocode} % % \begin{macrocode} %<@@=regex> % \end{macrocode} % % \subsection{Plan of attack} % % Most regex engines use backtracking. This allows to provide very % powerful features (back-references come to mind first), but it is % costly, and raises the problem of catastrophic backtracking. Since % \TeX{} is not first and foremost a programming language, complicated % code tends to run slowly, and we must use faster, albeit slightly more % restrictive, techniques, coming from automata theory. % % Given a regular expression of $n$ characters, we do the following: % \begin{itemize} % \item (Compiling.) Analyse the regex, finding invalid input, and % convert it to an internal representation. % \item (Building.) Convert the compiled regex to a non-deterministic % finite automaton (\textsc{nfa}) with $O(n)$ states which % accepts precisely token lists matching that regex. % \item (Matching.) Loop through the query token list one token (one % \enquote{position}) at a time, exploring in parallel every % possible path (\enquote{active thread}) through the \textsc{nfa}, % considering active threads in an order determined by the % quantifiers' greediness. % \end{itemize} % % We use the following vocabulary in the code comments (and in variable % names). % \begin{itemize} % \item \emph{Group}: index of the capturing group, $-1$ for % non-capturing groups. ^^A start/end index? % \item \emph{Position}: each token in the query is labelled by an % integer \meta{position}, with $\texttt{min_pos} - 1 \leq % \meta{position} \leq \texttt{max_pos}$. 
The lowest and highest % positions $\texttt{min_pos} - 1$ and $\texttt{max_pos}$ % correspond to imaginary begin and end markers (with % non-existent category code and character code). % $\texttt{max_pos}$ is only set quite late in the processing. % \item \emph{Query}: the token list to which we apply the regular % expression. % \item \emph{State}: each state of the \textsc{nfa} is labelled by an % integer \meta{state} with $\texttt{min_state} \leq \meta{state} < % \texttt{max_state}$. % \item \emph{Active thread}: state of the \textsc{nfa} that is reached % when reading the query token list for the matching. Those threads % are ordered according to the greediness of quantifiers. % \item \emph{Step}: used when matching, starts at $0$, incremented % every time a character is read, and is not reset when searching % for repeated matches. The integer \cs{l_@@_step_int} is a % unique id for all the steps of the matching algorithm. % \end{itemize} % % We use \pkg{l3intarray} to manipulate arrays of integers. % We also abuse \TeX{}'s % \tn{toks} registers, by accessing them directly by number rather than % tying them to control sequence using the \tn{newtoks} allocation % functions. Specifically, these arrays and \tn{toks} are used as % follows. When building, % \tn{toks}\meta{state} holds the tests and actions to perform in the % \meta{state} of the \textsc{nfa}. When matching, % \begin{itemize} % \item \cs{g_@@_state_active_intarray} holds the last \meta{step} in % which each \meta{state} was active. % \item \cs{g_@@_thread_info_intarray} consists of blocks for each % \meta{thread} (with $\texttt{min_thread} \leq \meta{thread} < % \texttt{max_thread}$). Each block has % $1+2\cs{l_@@_capturing_group_int}$ entries: the \meta{state} in % which the \meta{thread} currently is, followed by the beginnings % of all submatches, and then the ends of all submatches. The % \meta{threads} are ordered starting from the best to the least % preferred. 
%   \item \cs{g_@@_submatch_prev_intarray}, \cs{g_@@_submatch_begin_intarray}
%     and \cs{g_@@_submatch_end_intarray} hold, for each submatch (as would
%     be extracted by \cs{regex_extract_all:nnN}), the place where the
%     submatch started to be looked for and its two end-points. For
%     historical reasons, the minimum index is twice \texttt{max_state},
%     and the used registers go up to \cs{l_@@_submatch_int}. They are
%     organized in blocks of \cs{l_@@_capturing_group_int} entries, each
%     block corresponding to one match with all its submatches stored in
%     consecutive entries.
% \end{itemize}
% When actually building the result,
% \begin{itemize}
%   \item \tn{toks}\meta{position} holds \meta{tokens} which \texttt{o}-
%     and \texttt{e}-expand to the \meta{position}-th token in the query.
%   \item \cs{g_@@_balance_intarray} holds the balance of begin-group and
%     end-group character tokens which appear before that point in the
%     token list.
% \end{itemize}
%
% The code is structured as follows. Variables are introduced in the
% relevant section. First we present some generic helper functions. Then
% comes the code for compiling a regular expression, and for showing the
% result of the compilation. The building phase converts a compiled
% regex to \textsc{nfa} states, and the automaton is run by the code in
% the following section. The only remaining brick is parsing the
% replacement text and performing the replacement. We are then ready for
% all the user functions. Finally, messages, and a little bit of tracing
% code.
%
% \subsection{Helpers}
%
% \begin{macro}{\@@_int_eval:w}
%   Access the primitive: performance is key here, so we do not use
%   the slower route \emph{via} \cs{int_eval:n}. Since this is a plain
%   copy of the primitive, every use has to terminate the integer
%   expression itself, for instance with \cs{scan_stop:} or with the
%   closing brace of an argument.
%    \begin{macrocode}
\cs_new_eq:NN \@@_int_eval:w \tex_numexpr:D
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_standard_escapechar:}
%   Make the \tn{escapechar} into the standard backslash.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_standard_escapechar:
  { \int_set:Nn \tex_escapechar:D { `\\ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_toks_use:w}
%   Unpack a \tn{toks} given its number. The number is read by the
%   primitive from the tokens which follow the function.
%    \begin{macrocode}
\cs_new:Npn \@@_toks_use:w { \tex_the:D \tex_toks:D }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_clear:N, \@@_toks_set:Nn, \@@_toks_set:No}
%   Empty a \tn{toks} or set it to a value, given its number. The
%   \texttt{Nn} function is the primitive itself, and clearing is the
%   same as setting to an empty value. The \texttt{No} function ends
%   with \cs{exp_after:wN}, which \TeX{} expands while looking for the
%   opening brace of the value: the first token of the value is thus
%   expanded once before the assignment takes place.
%    \begin{macrocode}
\cs_new_eq:NN \@@_toks_set:Nn \tex_toks:D
\cs_new_protected:Npn \@@_toks_clear:N #1
  { \@@_toks_set:Nn #1 { } }
\cs_new_protected:Npn \@@_toks_set:No #1
  { \tex_toks:D #1 \exp_after:wN }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_memcpy:NNn}
%   Copy |#3| \tn{toks} registers from |#2| onwards to |#1| onwards,
%   like |C|'s |memcpy|. Both |#1| and |#2| are integer variables
%   holding register numbers; each of them is incremented once per
%   register copied, hence by |#3| in total.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_toks_memcpy:NNn #1#2#3
  {
    \prg_replicate:nn {#3}
      {
        \tex_toks:D #1 = \tex_toks:D #2
        \int_incr:N #1
        \int_incr:N #2
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_put_left:Ne}
% \begin{macro}{\@@_toks_put_right:Ne, \@@_toks_put_right:Nn}
%   During the building phase we wish to add \texttt{e}-expanded
%   material to \tn{toks}, either to the left or to the right. The
%   expansion is done \enquote{by hand} for optimization (these
%   operations are used quite a lot). The \texttt{Nn} version of
%   \cs{@@_toks_put_right:Ne} is provided because it is more
%   efficient than \texttt{e}-expanding with \cs{exp_not:n}.
%    \begin{macrocode}
\cs_if_exist:NTF \tex_etokspre:D
  { \cs_new_eq:NN \@@_toks_put_left:Ne \tex_etokspre:D }
  {
    \cs_new_protected:Npn \@@_toks_put_left:Ne #1#2
      { \tex_toks:D #1 = \tex_expanded:D {{ #2 \tex_the:D \tex_toks:D #1 }} }
  }
\cs_if_exist:NTF \tex_etoksapp:D
  { \cs_new_eq:NN \@@_toks_put_right:Ne \tex_etoksapp:D }
  {
    \cs_new_protected:Npn \@@_toks_put_right:Ne #1#2
      { \tex_toks:D #1 = \tex_expanded:D {{ \tex_the:D \tex_toks:D #1 #2 }} }
  }
\cs_if_exist:NTF \tex_toksapp:D
  { \cs_new_eq:NN \@@_toks_put_right:Nn \tex_toksapp:D }
  {
    \cs_new_protected:Npn \@@_toks_put_right:Nn #1#2
      { \tex_toks:D #1 = \exp_after:wN { \tex_the:D \tex_toks:D #1 #2 } }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_curr_cs_to_str:}
%   Expands to the string representation of the token (known to be a
%   control sequence) at the current position \cs{l_@@_curr_pos_int}.
%   The token is obtained from \cs{l_@@_curr_token_tl}: that variable,
%   then the first token of its contents, are expanded before
%   \cs{cs_to_str:N} acts.
%   It should only be used in \texttt{e}/\texttt{x}-expansion to avoid losing a
%   leading space.
%    \begin{macrocode}
\cs_new:Npn \@@_curr_cs_to_str:
  {
    \exp_after:wN \exp_after:wN \exp_after:wN \cs_to_str:N
    \l_@@_curr_token_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_intarray_item:NnF, \@@_intarray_item_aux:nNF}
%   Item of intarray, with a default value. The index |#2| is an
%   integer expression, evaluated once. If the result is positive the
%   corresponding entry of the intarray |#1| is left in the input
%   stream, otherwise the \texttt{F} argument is used instead. Only
%   this lower bound is tested here.
%    \begin{macrocode}
\cs_new:Npn \@@_intarray_item:NnF #1#2
  { \exp_args:No \@@_intarray_item_aux:nNF { \tex_the:D \@@_int_eval:w #2 } #1 }
\cs_new:Npn \@@_intarray_item_aux:nNF #1#2
  {
    \if_int_compare:w #1 > \c_zero_int
      \exp_after:wN \use_i:nn
    \else:
      \exp_after:wN \use_ii:nn
    \fi:
    { \__kernel_intarray_item:Nn #2 {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_maplike_break:}
%   Analogous to \cs{tl_map_break:}, this correctly exits
%   \cs{tl_map_inline:nn} and similar constructions and jumps to the
%   matching \cs{prg_break_point:Nn} \cs{@@_maplike_break:} |{| |}|.
%    \begin{macrocode}
\cs_new:Npn \@@_maplike_break:
  { \prg_map_break:Nn \@@_maplike_break: { } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_tl_odd_items:n, \@@_tl_even_items:n, \@@_tl_even_items_loop:nn}
%   Map through a token list one pair at a time, leaving the
%   odd-numbered or even-numbered items (the first item is
%   numbered~$1$). The odd-numbered items are obtained by putting a
%   dummy item~|?| in front of the token list and keeping the
%   even-numbered items of the result. Each item which is kept is left
%   as a brace group whose contents are protected by \cs{exp_not:n}.
%   Two copies of \cs{q_@@_nil} are added after the token list, and the
%   loop stops as soon as the second item of a pair is that quark.
%    \begin{macrocode}
\cs_new:Npn \@@_tl_odd_items:n #1 { \@@_tl_even_items:n { ? #1 } }
\cs_new:Npn \@@_tl_even_items:n #1
  {
    \@@_tl_even_items_loop:nn #1 \q_@@_nil \q_@@_nil
    \prg_break_point:
  }
\cs_new:Npn \@@_tl_even_items_loop:nn #1#2
  {
    \@@_use_none_delimit_by_q_nil:w #2 \prg_break: \q_@@_nil
    { \exp_not:n {#2} }
    \@@_tl_even_items_loop:nn
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Constants and variables}
%
% \begin{macro}{\@@_tmp:w}
%   Temporary function used for various short-term purposes.
%    \begin{macrocode}
\cs_new:Npn \@@_tmp:w { }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}
%   {
%     \l_@@_internal_a_tl, \l_@@_internal_b_tl,
%     \l_@@_internal_a_int, \l_@@_internal_b_int,
%     \l_@@_internal_c_int, \l_@@_internal_bool,
%     \l_@@_internal_seq, \g_@@_internal_tl,
%   }
%   Temporary variables used for various purposes.
%    \begin{macrocode}
\tl_new:N   \l_@@_internal_a_tl
\tl_new:N   \l_@@_internal_b_tl
\int_new:N  \l_@@_internal_a_int
\int_new:N  \l_@@_internal_b_int
\int_new:N  \l_@@_internal_c_int
\bool_new:N \l_@@_internal_bool
\seq_new:N  \l_@@_internal_seq
\tl_new:N   \g_@@_internal_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_build_tl}
%   This temporary variable is specifically for use with the |tl_build|
%   machinery.
%    \begin{macrocode}
\tl_new:N \l_@@_build_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_no_match_regex}
%   This regular expression matches nothing, but is still a valid
%   regular expression. We could use a failing assertion, but I went for
%   an empty class. It is used as the initial value for regular
%   expressions declared using \cs{regex_new:N}.
%    \begin{macrocode}
\tl_const:Nn \c_@@_no_match_regex
  {
    \@@_branch:n
      { \@@_class:NnnnN \c_true_bool { } { 1 } { 0 } \c_true_bool }
  }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_balance_int}
%   During this phase, \cs{l_@@_balance_int} counts the balance of
%   begin-group and end-group character tokens which appear before a
%   given point in the token list. This variable is also used to keep
%   track of the balance in the replacement text.
%    \begin{macrocode}
\int_new:N \l_@@_balance_int
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Testing characters}
%
% \begin{variable}{\c_@@_ascii_min_int, \c_@@_ascii_max_control_int, \c_@@_ascii_max_int}
%   Character codes delimiting the \textsc{ascii} range ($0$ to $127$)
%   and its initial block of control characters ($0$ to $31$).
%    \begin{macrocode}
\int_const:Nn \c_@@_ascii_min_int { 0 }
\int_const:Nn \c_@@_ascii_max_control_int { 31 }
\int_const:Nn \c_@@_ascii_max_int { 127 }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_ascii_lower_int}
%   Difference between the character codes of a lower-case
%   \textsc{ascii} letter and of the corresponding upper-case letter.
%    \begin{macrocode}
\int_const:Nn \c_@@_ascii_lower_int { `a - `A }
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Internal auxiliaries}
%
% \begin{variable}{\q_@@_recursion_stop}
%   Internal recursion quarks.
%    \begin{macrocode}
\quark_new:N \q_@@_recursion_stop
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\q_@@_nil}
%   Internal quarks.
%    \begin{macrocode}
\quark_new:N \q_@@_nil
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}[EXP]{
%     \@@_use_none_delimit_by_q_recursion_stop:w,
%     \@@_use_i_delimit_by_q_recursion_stop:nw,
%     \@@_use_none_delimit_by_q_nil:w,
%   }
%   Functions to gobble up to a quark. The \texttt{use_i} function
%   keeps its first argument and discards the rest up to the quark.
%    \begin{macrocode}
\cs_new:Npn \@@_use_none_delimit_by_q_recursion_stop:w
  #1 \q_@@_recursion_stop { }
\cs_new:Npn \@@_use_i_delimit_by_q_recursion_stop:nw
  #1 #2 \q_@@_recursion_stop {#1}
\cs_new:Npn \@@_use_none_delimit_by_q_nil:w #1 \q_@@_nil { }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_quark_if_nil:NF}
%   Branching quark conditional. Only the \texttt{F} variant is
%   requested in the definition.
%    \begin{macrocode}
\__kernel_quark_new_conditional:Nn \@@_quark_if_nil:N { F }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_break_point:TF}
% \begin{macro}{\@@_break_true:w}
%   When testing whether a character of the query token list matches
%   a given character class in the regular expression, we often
%   have to test it against several ranges of characters, checking
%   if any one of those matches. This is done with a structure like
%   \begin{quote}
%     \meta{test1} \ldots{} \meta{test$\sb{n}$} \\
%     \cs{@@_break_point:TF} \Arg{true code} \Arg{false code}
%   \end{quote}
%   If any of the tests succeeds, it calls \cs{@@_break_true:w},
%   which cleans up and leaves \meta{true code} in the input stream.
%   Otherwise, \cs{@@_break_point:TF} leaves the \meta{false code}
%   in the input stream. The clean-up consists in discarding all
%   tokens up to the next \cs{@@_break_point:TF}, that is, the tests
%   which have not been performed yet, together with the
%   \meta{false code}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_break_true:w
  #1 \@@_break_point:TF #2 #3 {#2}
\cs_new_protected:Npn \@@_break_point:TF #1 #2 { #2 }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_item_reverse:n}
%   This function makes showing regular expressions easier, and lets us
%   define |\D| in terms of |\d| for instance. There is a subtlety: the
%   end of the query is marked by $-2$, and thus matches |\D| and
%   other negated properties; this case is caught by another part of
%   the code. The tests |#1| are followed by a break point of their
%   own, with an empty \meta{true code} and \cs{@@_break_true:w} as
%   \meta{false code}: if one of the tests in |#1| succeeds nothing
%   happens, and if they all fail \cs{@@_break_true:w} is run and
%   jumps to the enclosing \cs{@@_break_point:TF}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_reverse:n #1
  {
    #1
    \@@_break_point:TF { } \@@_break_true:w
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\@@_item_caseful_equal:n, \@@_item_caseful_range:nn}
%   Simple comparisons triggering \cs{@@_break_true:w} when true.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_caseful_equal:n #1
  {
    \if_int_compare:w #1 = \l_@@_curr_char_int
      \exp_after:wN \@@_break_true:w
    \fi:
  }
\cs_new_protected:Npn \@@_item_caseful_range:nn #1 #2
  {
    \reverse_if:N \if_int_compare:w #1 > \l_@@_curr_char_int
      \reverse_if:N \if_int_compare:w #2 < \l_@@_curr_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w
      \fi:
    \fi:
  }
%    \end{macrocode}
%   The range test is inclusive: it succeeds when the current
%   character code is neither less than |#1| nor greater than |#2|.
%   In both functions the \cs{exp_after:wN} close the conditionals
%   before \cs{@@_break_true:w} grabs its delimited argument.
% \end{macro}
%
% \begin{macro}
%   {\@@_item_caseless_equal:n, \@@_item_caseless_range:nn}
%   For caseless matching, we perform the test both on the
%   \texttt{curr_char} and on the \texttt{case_changed_char}. Before
%   doing the second set of tests, we make sure that
%   \texttt{case_changed_char} has been computed.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_caseless_equal:n #1
  {
    \if_int_compare:w #1 = \l_@@_curr_char_int
      \exp_after:wN \@@_break_true:w
    \fi:
    \@@_maybe_compute_ccc:
    \if_int_compare:w #1 = \l_@@_case_changed_char_int
      \exp_after:wN \@@_break_true:w
    \fi:
  }
\cs_new_protected:Npn \@@_item_caseless_range:nn #1 #2
  {
    \reverse_if:N \if_int_compare:w #1 > \l_@@_curr_char_int
      \reverse_if:N \if_int_compare:w #2 < \l_@@_curr_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w
      \fi:
    \fi:
    \@@_maybe_compute_ccc:
    \reverse_if:N \if_int_compare:w #1 > \l_@@_case_changed_char_int
      \reverse_if:N \if_int_compare:w #2 < \l_@@_case_changed_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compute_case_changed_char:}
%   This function is called when \cs{l_@@_case_changed_char_int} has
%   not yet been computed. If the current character code is in the range
%   $[65,90]$ (upper-case), then add $32$, making it lowercase. If it is
%   in the lower-case letter range $[97,122]$, subtract $32$.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compute_case_changed_char:
  {
    \int_set_eq:NN \l_@@_case_changed_char_int \l_@@_curr_char_int
    \if_int_compare:w \l_@@_curr_char_int > `Z \exp_stop_f:
      \reverse_if:N \if_int_compare:w \l_@@_curr_char_int > `z \exp_stop_f:
        \reverse_if:N \if_int_compare:w \l_@@_curr_char_int < `a \exp_stop_f:
          \int_sub:Nn \l_@@_case_changed_char_int
            { \c_@@_ascii_lower_int }
        \fi:
      \fi:
    \else:
      \reverse_if:N \if_int_compare:w \l_@@_curr_char_int < `A \exp_stop_f:
        \int_add:Nn \l_@@_case_changed_char_int
          { \c_@@_ascii_lower_int }
      \fi:
    \fi:
    \cs_set_eq:NN \@@_maybe_compute_ccc: \prg_do_nothing:
  }
%    \end{macrocode}
%   Character codes outside the two letter ranges are copied
%   unchanged. Once the computation is done,
%   \cs[no-index]{@@_maybe_compute_ccc:} is made a no-op, so that
%   further caseless tests reuse the stored value; it initially
%   points to the computation itself.
%    \begin{macrocode}
\cs_new_eq:NN \@@_maybe_compute_ccc: \@@_compute_case_changed_char:
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_item_equal:n, \@@_item_range:nn}
%   Those must always be defined to expand to a \texttt{caseful}
%   (default) or \texttt{caseless} version, and not be protected: they
%   must expand when compiling, to hard-code which tests are caseless or
%   caseful. Here they are only given a placeholder meaning.
%    \begin{macrocode}
\cs_new_eq:NN \@@_item_equal:n ?
\cs_new_eq:NN \@@_item_range:nn ?
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_item_catcode:nT, \@@_item_catcode_reverse:nT}
% \begin{macro}{\@@_item_catcode:}
%   The argument is a sum of powers of $4$ with exponents given by the
%   allowed category codes (between $0$ and $13$). Dividing by a given
%   power of $4$ gives an odd result if and only if that category code
%   is allowed. If the catcode does not match, then skip the character
%   code tests which follow.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_catcode:
  {
    "
    \if_case:w \l_@@_curr_catcode_int
         1       \or: 4       \or: 10      \or: 40
    \or: 100     \or:         \or: 1000    \or: 4000
    \or: 10000   \or:         \or: 100000  \or: 400000
    \or: 1000000 \or: 4000000 \else: 1*0
    \fi:
  }
%    \end{macrocode}
%   The auxiliary above produces a hexadecimal number, equal to $4^c$
%   where $c$ is \cs{l_@@_curr_catcode_int}. The entries for $c=5$
%   and $c=9$ are left empty. For any value of $c$ beyond $13$ the
%   result is |"1*0|, so that the quotient computed below is multiplied
%   by zero and the test is false.
%    \begin{macrocode}
\prg_new_protected_conditional:Npnn \@@_item_catcode:n #1 { T }
  {
    \if_int_odd:w \@@_int_eval:w #1 / \@@_item_catcode: \scan_stop:
      \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
\cs_new_protected:Npn \@@_item_catcode_reverse:nT #1#2
  { \@@_item_catcode:nT {#1} { \@@_item_reverse:n {#2} } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_item_exact:nn, \@@_item_exact_cs:n}
%   This matches an exact \meta{category}-\meta{character code} pair, or
%   an exact control sequence, more precisely one of several possible
%   control sequences, separated by \cs{scan_stop:}. For control
%   sequences, the name of the current token is put between two
%   \cs{scan_stop:} and searched for in the argument, itself put
%   between two \cs{scan_stop:}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_exact:nn #1#2
  {
    \if_int_compare:w #1 = \l_@@_curr_catcode_int
      \if_int_compare:w #2 = \l_@@_curr_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w
      \fi:
    \fi:
  }
\cs_new_protected:Npn \@@_item_exact_cs:n #1
  {
    \int_compare:nNnTF \l_@@_curr_catcode_int = \c_zero_int
      {
        \__kernel_tl_set:Nx \l_@@_internal_a_tl
          { \scan_stop: \@@_curr_cs_to_str: \scan_stop: }
        \tl_if_in:noTF { \scan_stop: #1 \scan_stop: }
          \l_@@_internal_a_tl
          { \@@_break_true:w } { }
      }
      { }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_item_cs:n}
%   Match a control sequence (the argument is a compiled regex).
%   First test the catcode of the current token to be zero.
%   Then perform the matching test, and break if the csname
%   indeed matches.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_cs:n #1
  {
    \int_compare:nNnT \l_@@_curr_catcode_int = \c_zero_int
      {
        \group_begin:
          \@@_single_match:
          \@@_disable_submatches:
          \@@_build_for_cs:n {#1}
          \bool_set_eq:NN \l_@@_saved_success_bool
            \g_@@_success_bool
          \exp_args:Ne \@@_match_cs:n
            { \@@_curr_cs_to_str: }
          \if_meaning:w \c_true_bool \g_@@_success_bool
            \group_insert_after:N \@@_break_true:w
          \fi:
          \bool_gset_eq:NN \g_@@_success_bool
            \l_@@_saved_success_bool
        \group_end:
      }
  }
%    \end{macrocode}
%   The nested match takes place inside a group. The value that
%   \cs{g_@@_success_bool} had on entry is saved in
%   \cs{l_@@_saved_success_bool} and put back before the group ends, so
%   the only trace of a successful nested match is the
%   \cs{@@_break_true:w} inserted after the group by
%   \cs{group_insert_after:N}.
% \end{macro}
%
% \subsubsection{Character property tests}
%
% \begin{macro}
%   {
%     \@@_prop_d:, \@@_prop_h:, \@@_prop_s:,
%     \@@_prop_v:, \@@_prop_w:, \@@_prop_N:
%   }
%   Character property tests for |\d|, |\W|, \emph{etc.} These character
%   properties are not affected by the |(?i)| option. The characters
%   recognized by each one are as follows: |\d=[0-9]|,
%   |\w=[0-9A-Z_a-z]|, \verb*+\s=[\ \^^I\^^J\^^L\^^M]+,
%   \verb*+\h=[\ \^^I]+, |\v=[\^^J-\^^M]|, and the upper case
%   counterparts match anything that the lower case does not match. The
%   property |\N| is defined separately as the reverse of a test for
%   the single character |\^^J|. The
%   order in which the various tests appear is optimized for usual
%   mostly lower case letter text.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_prop_d:
  { \@@_item_caseful_range:nn { `0 } { `9 } }
\cs_new_protected:Npn \@@_prop_h:
  {
    \@@_item_caseful_equal:n { `\  }
    \@@_item_caseful_equal:n { `\^^I }
  }
\cs_new_protected:Npn \@@_prop_s:
  {
    \@@_item_caseful_equal:n { `\  }
    \@@_item_caseful_equal:n { `\^^I }
    \@@_item_caseful_equal:n { `\^^J }
    \@@_item_caseful_equal:n { `\^^L }
    \@@_item_caseful_equal:n { `\^^M }
  }
\cs_new_protected:Npn \@@_prop_v:
  { \@@_item_caseful_range:nn { `\^^J } { `\^^M } } % lf, vtab, ff, cr
\cs_new_protected:Npn \@@_prop_w:
  {
    \@@_item_caseful_range:nn { `a } { `z }
    \@@_item_caseful_range:nn { `A } { `Z }
    \@@_item_caseful_range:nn { `0 } { `9 }
    \@@_item_caseful_equal:n { `_ }
  }
\cs_new_protected:Npn \@@_prop_N:
  {
    \@@_item_reverse:n
      { \@@_item_caseful_equal:n { `\^^J } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_posix_alnum:, \@@_posix_alpha:, \@@_posix_ascii:,
%     \@@_posix_blank:, \@@_posix_cntrl:, \@@_posix_digit:,
%     \@@_posix_graph:, \@@_posix_lower:, \@@_posix_print:,
%     \@@_posix_punct:, \@@_posix_space:, \@@_posix_upper:,
%     \@@_posix_word: , \@@_posix_xdigit:
%   }
%   \textsc{posix} properties. No surprise. Several of them are
%   combinations or plain copies of other tests, and all of them use
%   the caseful comparisons.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_posix_alnum:
  { \@@_posix_alpha: \@@_posix_digit: }
\cs_new_protected:Npn \@@_posix_alpha:
  { \@@_posix_lower: \@@_posix_upper: }
\cs_new_protected:Npn \@@_posix_ascii:
  {
    \@@_item_caseful_range:nn
      { \c_@@_ascii_min_int }
      { \c_@@_ascii_max_int }
  }
\cs_new_eq:NN \@@_posix_blank: \@@_prop_h:
\cs_new_protected:Npn \@@_posix_cntrl:
  {
    \@@_item_caseful_range:nn
      { \c_@@_ascii_min_int }
      { \c_@@_ascii_max_control_int }
    \@@_item_caseful_equal:n { \c_@@_ascii_max_int }
  }
\cs_new_eq:NN \@@_posix_digit: \@@_prop_d:
\cs_new_protected:Npn \@@_posix_graph:
  { \@@_item_caseful_range:nn { `! } { `\~ } }
\cs_new_protected:Npn \@@_posix_lower:
  { \@@_item_caseful_range:nn { `a } { `z } }
\cs_new_protected:Npn \@@_posix_print:
  { \@@_item_caseful_range:nn { `\  } { `\~ } }
\cs_new_protected:Npn \@@_posix_punct:
  {
    \@@_item_caseful_range:nn { `! } { `/ }
    \@@_item_caseful_range:nn { `: } { `@ }
    \@@_item_caseful_range:nn { `[ } { `` }
    \@@_item_caseful_range:nn { `\{ } { `\~ }
  }
\cs_new_protected:Npn \@@_posix_space:
  {
    \@@_item_caseful_equal:n { `\  }
    \@@_item_caseful_range:nn { `\^^I } { `\^^M }
  }
\cs_new_protected:Npn \@@_posix_upper:
  { \@@_item_caseful_range:nn { `A } { `Z } }
\cs_new_eq:NN \@@_posix_word: \@@_prop_w:
\cs_new_protected:Npn \@@_posix_xdigit:
  {
    \@@_posix_digit:
    \@@_item_caseful_range:nn { `A } { `F }
    \@@_item_caseful_range:nn { `a } { `f }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Simple character escape}
%
% Before actually parsing the regular expression or the replacement
% text, we go through them once, converting |\n| to the character $10$,
% \emph{etc.} In this pass, we also convert any special character
% (\texttt{*}, \texttt{?}, \texttt{\{}, etc.) or escaped alphanumeric
% character into a marker indicating that this was a special sequence,
% and replace escaped special characters and non-escaped alphanumeric
% characters by markers indicating that those were \enquote{raw}
% characters. The rest of the code can then avoid caring about escaping
% issues (those can become quite complex to handle in combination with
% ranges in character classes).
%
% Usage: \cs{@@_escape_use:nnnn} \meta{inline~1} \meta{inline~2}
% \meta{inline~3} \Arg{token list} The \meta{token list} is converted to
% a string, then read from left to right, interpreting backslashes as
% escaping the next character.
% Unescaped characters are fed to the
% function \meta{inline~1}, and escaped characters are fed to the function
% \meta{inline~2} within an \texttt{e}-expansion context (typically those
% functions perform some tests on their argument to decide how to output
% them). The escape sequences |\a|, |\e|, |\f|, |\n|, |\r|, |\t| and
% |\x| are recognized, and those are replaced by the corresponding
% character, then fed to \meta{inline~3}. The result is then left in the
% input stream. Spaces are ignored unless escaped.
%
% The conversion is done within an \texttt{e}-expanding assignment.
%
% \begin{macro}{\@@_escape_use:nnnn}
%   The result is built in \cs{l_@@_internal_a_tl}, which is then left
%   in the input stream. Tracing code is added as appropriate inside
%   this token list. Go through |#4| once, applying |#1|,
%   |#2|, or |#3| as relevant to each character (after de-escaping
%   it).
%
%   Everything happens inside a group, so the three per-character
%   functions defined from |#1|, |#2| and |#3| stay local. The argument
%   |#4| is converted with \cs{__kernel_str_to_other_fast:n} and stored
%   in \cs{g_@@_internal_tl}; \cs{@@_escape_loop:N} then runs over it
%   inside the \texttt{e}-expanding \cs{tl_put_right:Ne}, with
%   \cs{scan_stop:} as end-marker and \cs{prg_break_point:} as the
%   target of \cs{prg_break:}. The final \cs{exp_after:wN} expands
%   \cs{l_@@_internal_a_tl} before the group is closed, so that its
%   contents survive the group.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_escape_use:nnnn #1#2#3#4
  {
    \group_begin:
      \tl_clear:N \l_@@_internal_a_tl
      \cs_set:Npn \@@_escape_unescaped:N ##1 { #1 }
      \cs_set:Npn \@@_escape_escaped:N ##1 { #2 }
      \cs_set:Npn \@@_escape_raw:N ##1 { #3 }
      \@@_standard_escapechar:
      \__kernel_tl_gset:Nx \g_@@_internal_tl
        { \__kernel_str_to_other_fast:n {#4} }
      \tl_put_right:Ne \l_@@_internal_a_tl
        {
          \exp_after:wN \@@_escape_loop:N \g_@@_internal_tl
          \scan_stop: \prg_break_point:
        }
      \exp_after:wN
    \group_end:
    \l_@@_internal_a_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_escape_loop:N}
% \begin{macro}+\@@_escape_\:w+
%   \cs{@@_escape_loop:N} reads one character: if it is special
%   (space, backslash, or end-marker), perform the associated action,
%   otherwise it is simply an unescaped character. After a backslash,
%   the same is done, but unknown characters are \enquote{escaped}.
%    \begin{macrocode}
\cs_new:Npn \@@_escape_loop:N #1
  {
    % Dispatch on the character: a dedicated handler if one exists,
    % otherwise the generic unescaped action; then loop.
    \cs_if_exist_use:cF { @@_escape_\token_to_str:N #1:w }
      { \@@_escape_unescaped:N #1 }
    \@@_escape_loop:N
  }
% The backslash handler is delimited by \@@_escape_loop:N: it removes
% the pending loop call and grabs the escaped character itself.
\cs_new:cpn { @@_escape_ \c_backslash_str :w }
    \@@_escape_loop:N #1
  {
    \cs_if_exist_use:cF { @@_escape_/\token_to_str:N #1:w }
      { \@@_escape_escaped:N #1 }
    \@@_escape_loop:N
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}
%   {\@@_escape_unescaped:N, \@@_escape_escaped:N, \@@_escape_raw:N}
%   Those functions are never called before being given a new meaning,
%   so their definitions here don't matter.
%    \begin{macrocode}
\cs_new_eq:NN \@@_escape_unescaped:N ?
\cs_new_eq:NN \@@_escape_escaped:N   ?
\cs_new_eq:NN \@@_escape_raw:N       ?
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_escape_\scan_stop::w, \@@_escape_/\scan_stop::w,
%     \@@_escape_/a:w, \@@_escape_/e:w, \@@_escape_/f:w,
%     \@@_escape_/n:w, \@@_escape_/r:w, \@@_escape_/t:w
%   }
% \begin{macro}+\@@_escape_ :w+
%   The loop is ended upon seeing the end-marker \cs{scan_stop:}, with
%   an error if the string ended in a
%   backslash. Spaces are ignored, and |\a|, |\e|, |\f|, |\n|, |\r|,
%   |\t| take their meaning here.
%    \begin{macrocode}
% End-marker, unescaped or right after a backslash.
\cs_new_eq:cN { @@_escape_ \iow_char:N\\scan_stop: :w } \prg_break:
\cs_new:cpn { @@_escape_/ \iow_char:N\\scan_stop: :w }
  {
    \msg_expandable_error:nn { regex } { trailing-backslash }
    \prg_break:
  }
% Unescaped space: dropped.
\cs_new:cpn { @@_escape_~:w } { }
\cs_new:cpe { @@_escape_/a:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^G } % bell
\cs_new:cpe { @@_escape_/t:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^I } % tab
\cs_new:cpe { @@_escape_/n:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^J } % lf
\cs_new:cpe { @@_escape_/f:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^L } % ff
\cs_new:cpe { @@_escape_/r:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^M } % cr
\cs_new:cpe { @@_escape_/e:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^[ } % esc
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_escape_/x:w}
% \begin{macro}{\@@_escape_x_end:w, \@@_escape_x_large:n}
%   When |\x| is encountered, \cs{@@_escape_x_test:N} is responsible for
%   grabbing some hexadecimal digits, and feeding the result to
%   \cs{@@_escape_x_end:w}. The digits are collected by expansion after
%   |\int_value:w "0|, and a semicolon marks their end. If the number is
%   larger than \cs{c_max_char_int}, produce an expandable error,
%   otherwise call \cs{@@_escape_raw:N}
%   on the corresponding character token (generated with category
%   code~$12$).
%    \begin{macrocode}
\cs_new:cpn { @@_escape_/x:w } \@@_escape_loop:N
  {
    \exp_after:wN \@@_escape_x_end:w
    \int_value:w "0 \@@_escape_x_test:N
  }
\cs_new:Npn \@@_escape_x_end:w #1 ;
  {
    \int_compare:nNnTF {#1} > \c_max_char_int
      {
        \msg_expandable_error:nnff { regex } { x-overflow }
          {#1} { \int_to_Hex:n {#1} }
      }
      {
        \exp_last_unbraced:Nf \@@_escape_raw:N
          { \char_generate:nn {#1} { 12 } }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_escape_x_test:N, \@@_escape_x_testii:N}
%   Find out whether the first character is a left brace (allowing any
%   number of hexadecimal digits), or not (allowing up to two
%   hexadecimal digits). We need to check for the end-of-string marker.
%   Eventually, call either \cs{@@_escape_x_loop:N} or
%   \cs{@@_escape_x:N}.
%    \begin{macrocode}
\cs_new:Npn \@@_escape_x_test:N #1
  {
    % At the end-marker, \use_i:nnn keeps only the semicolon (ending
    % the number) and discards \use:n together with its argument.
    \if_meaning:w \scan_stop: #1
      \exp_after:wN \use_i:nnn \exp_after:wN ;
    \fi:
    \use:n
      {
        % Skip spaces before the first significant character.
        \if_charcode:w \c_space_token #1
          \exp_after:wN \@@_escape_x_test:N
        \else:
          \exp_after:wN \@@_escape_x_testii:N
          \exp_after:wN #1
        \fi:
      }
  }
\cs_new:Npn \@@_escape_x_testii:N #1
  {
    \if_charcode:w \c_left_brace_str #1
      \exp_after:wN \@@_escape_x_loop:N
    \else:
      % Not a hexadecimal digit: end the number and hand the character
      % back to the main loop.
      \@@_hexadecimal_use:NTF #1
        { \exp_after:wN \@@_escape_x:N }
        { ; \exp_after:wN \@@_escape_loop:N \exp_after:wN #1 }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_escape_x:N}
%   This looks for the second digit in the unbraced case. Whether or
%   not it is found, the number ends here; a character which is not a
%   hexadecimal digit is given back to \cs{@@_escape_loop:N}.
%    \begin{macrocode}
\cs_new:Npn \@@_escape_x:N #1
  {
    \if_meaning:w \scan_stop: #1
      \exp_after:wN \use_i:nnn \exp_after:wN ;
    \fi:
    \use:n
      {
        \@@_hexadecimal_use:NTF #1
          { ; \@@_escape_loop:N }
          { ; \@@_escape_loop:N #1 }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_escape_x_loop:N, \@@_escape_x_loop_error:n}
%   Grab hexadecimal digits, skip spaces, and at the end, check that
%   there is a right brace, otherwise raise an expandable error and
%   resume the main loop with the offending character.
%    \begin{macrocode}
\cs_new:Npn \@@_escape_x_loop:N #1
  {
    % At the end-marker \use_ii:nnn selects the first (error) branch
    % below; otherwise \use_ii:nn selects the second one.
    \if_meaning:w \scan_stop: #1
      \exp_after:wN \use_ii:nnn
    \fi:
    \use_ii:nn
      { ; \@@_escape_x_loop_error:n { } {#1} }
      {
        \@@_hexadecimal_use:NTF #1
          { \@@_escape_x_loop:N }
          {
            \token_if_eq_charcode:NNTF \c_space_token #1
              { \@@_escape_x_loop:N }
              {
                ;
                \exp_after:wN
                \token_if_eq_charcode:NNTF \c_right_brace_str #1
                  { \@@_escape_loop:N }
                  { \@@_escape_x_loop_error:n {#1} }
              }
          }
      }
  }
\cs_new:Npn \@@_escape_x_loop_error:n #1
  {
    \msg_expandable_error:nnn { regex } { x-missing-rbrace } {#1}
    \@@_escape_loop:N #1
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_hexadecimal_use:NTF}
%   \TeX{} detects uppercase hexadecimal digits for us but not the
%   lowercase letters, which we need to detect and replace by their
%   uppercase counterpart.
%    \begin{macrocode}
\cs_new:Npn \@@_hexadecimal_use:NTF #1
  {
    % For a digit or an uppercase A-F, "1#1 is a hexadecimal number
    % larger than one: leave #1 in the input, then take the true branch.
    \if_int_compare:w \c_one_int < "1 \token_to_str:N #1 \exp_stop_f:
      #1
    \else:
      % Lowercase a-f are at offsets 0-5 from `a: leave the uppercase
      % letter.  Any other offset selects the false branch.
      \if_case:w
        \@@_int_eval:w \exp_after:wN ` \token_to_str:N #1 - `a \scan_stop:
           A
      \or: B
      \or: C
      \or: D
      \or: E
      \or: F
      \else:
        \exp_after:wN \exp_after:wN \exp_after:wN \use_iii:nnn
      \fi:
    \fi:
    \use_i:nn
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]
%   {\@@_char_if_alphanumeric:NTF, \@@_char_if_special:NTF}
%   These two tests are used in the first pass when parsing a regular
%   expression. That pass is responsible for finding escaped and
%   non-escaped characters, and recognizing which ones have special
%   meanings and which should be interpreted as \enquote{raw}
%   characters. Namely,
%   \begin{itemize}
%     \item alphanumerics are \enquote{raw} if they are not escaped, and
%       may have a special meaning when escaped;
%     \item non-alphanumeric printable ascii characters are
%       \enquote{raw} if they are escaped, and may have a special
%       meaning when not escaped;
%     \item characters other than printable ascii are always
%       \enquote{raw}.
%   \end{itemize}
%   The code is ugly, and highly based on magic numbers and the ascii
%   codes of characters. This is mostly unavoidable for performance
%   reasons. Maybe the tests can be optimized a little bit more.
%   Here, \enquote{alphanumeric} means \texttt{0}--\texttt{9},
%   \texttt{A}--\texttt{Z}, \texttt{a}--\texttt{z};
%   \enquote{special} character means non-alphanumeric
%   but printable ascii, from space (hex \texttt{20}) to
%   tilde (hex \texttt{7E}).
%    \begin{macrocode}
% Both tests compare a leading T with the first letter left by the
% nested conditionals: an F left on the way makes the comparison fail,
% otherwise the trailing T is reached and the result is true.
\prg_new_conditional:Npnn \@@_char_if_special:N #1 { TF }
  {
    \if:w
      T
      \if_int_compare:w `#1 > `Z \exp_stop_f:
        \if_int_compare:w `#1 > `z \exp_stop_f:
          % After z: special only below \c_@@_ascii_max_int.
          \if_int_compare:w `#1 < \c_@@_ascii_max_int
          \else: F \fi:
        \else:
          % Between Z and z: special unless a lowercase letter.
          \if_int_compare:w `#1 < `a \exp_stop_f:
          \else: F \fi:
        \fi:
      \else:
        \if_int_compare:w `#1 > `9 \exp_stop_f:
          % Between 9 and Z: special unless an uppercase letter.
          \if_int_compare:w `#1 < `A \exp_stop_f:
          \else: F \fi:
        \else:
          % Up to 9: digits and characters below space are not special.
          \if_int_compare:w `#1 < `0 \exp_stop_f:
            \if_int_compare:w `#1 < `\ \exp_stop_f:
              F \fi:
          \else: F \fi:
        \fi:
      \fi:
      T
      \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
\prg_new_conditional:Npnn \@@_char_if_alphanumeric:N #1 { TF }
  {
    \if:w
      T
      \if_int_compare:w `#1 > `Z \exp_stop_f:
        \if_int_compare:w `#1 > `z \exp_stop_f:
          F
        \else:
          \if_int_compare:w `#1 < `a \exp_stop_f:
            F \fi:
        \fi:
      \else:
        \if_int_compare:w `#1 > `9 \exp_stop_f:
          \if_int_compare:w `#1 < `A \exp_stop_f:
            F \fi:
        \else:
          \if_int_compare:w `#1 < `0 \exp_stop_f:
            F \fi:
        \fi:
      \fi:
      T
      \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Compiling}
%
% A regular expression starts its life as a string of characters. In
% this section, we convert it to internal instructions, resulting in a
% \enquote{compiled} regular expression. This compiled expression is
% then turned into states of an automaton in the building
% phase. Compiled regular expressions consist of the following:
% \begin{itemize}
%   \item \cs{@@_class:NnnnN} \meta{boolean} \Arg{tests} \Arg{min}
%     \Arg{more} \meta{laziness}
%   \item \cs{@@_group:nnnN} \Arg{branches} \Arg{min} \Arg{more}
%     \meta{laziness}, also \cs{@@_group_no_capture:nnnN} and
%     \cs{@@_group_resetting:nnnN} with the same syntax.
%   \item \cs{@@_branch:n} \Arg{contents}
%   \item \cs{@@_command_K:}
%   \item \cs{@@_assertion:Nn} \meta{boolean} \Arg{assertion test},
%     where the \meta{assertion test} is \cs{@@_b_test:} or
%     \cs{@@_Z_test:} or \cs{@@_A_test:} or \cs{@@_G_test:}
% \end{itemize}
% Tests can be the following:
% \begin{itemize}
%   \item \cs{@@_item_caseful_equal:n} \Arg{char code}
%   \item \cs{@@_item_caseless_equal:n} \Arg{char code}
%   \item \cs{@@_item_caseful_range:nn} \Arg{min} \Arg{max}
%   \item \cs{@@_item_caseless_range:nn} \Arg{min} \Arg{max}
%   \item \cs{@@_item_catcode:nT} \Arg{catcode bitmap} \Arg{tests}
%   \item \cs{@@_item_catcode_reverse:nT} \Arg{catcode bitmap} \Arg{tests}
%   \item \cs{@@_item_reverse:n} \Arg{tests}
%   \item \cs{@@_item_exact:nn} \Arg{catcode} \Arg{char code}
%   \item \cs{@@_item_exact_cs:n} \Arg{csnames}, more precisely given as
%     \meta{csname} \cs{scan_stop:} \meta{csname} \cs{scan_stop:}
%     \meta{csname} and so on in a brace group.
%   \item \cs{@@_item_cs:n} \Arg{compiled regex}
% \end{itemize}
%
% \subsubsection{Variables used when compiling}
%
% \begin{variable}{\l_@@_group_level_int}
%   We make sure to open the same number of groups as we close. This
%   integer is reset by \cs{@@_compile:w}; if it is still positive in
%   \cs{@@_compile_end:}, that many groups are closed there after an
%   error message.
%    \begin{macrocode}
\int_new:N \l_@@_group_level_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_mode_int}
% \begin{variable}
%   {
%     \c_@@_cs_in_class_mode_int,
%     \c_@@_cs_mode_int,
%     \c_@@_outer_mode_int,
%     \c_@@_catcode_mode_int,
%     \c_@@_class_mode_int,
%     \c_@@_catcode_in_class_mode_int
%   }
%   While compiling, ten modes are recognized, labelled $-63$, $-23$,
%   $-6$, $-2$, $0$, $2$, $3$, $6$, $23$, $63$. See
%   section~\ref{sec:regex-modes}. We only define some of these as
%   constants.
%    \begin{macrocode}
\int_new:N \l_@@_mode_int
\int_const:Nn \c_@@_cs_in_class_mode_int { -6 } % [\c{...}]
\int_const:Nn \c_@@_cs_mode_int { -2 } % \c{...}
\int_const:Nn \c_@@_outer_mode_int { 0 } % outer level
\int_const:Nn \c_@@_catcode_mode_int { 2 } % \c...
\int_const:Nn \c_@@_class_mode_int { 3 } % [...]
\int_const:Nn \c_@@_catcode_in_class_mode_int { 6 } % [\c...]
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{variable}{\l_@@_catcodes_int, \l_@@_default_catcodes_int}
% \begin{variable}{\l_@@_catcodes_bool}
%   We wish to allow constructions such as |\c[^BE](..\cL[a-z]..)|,
%   where the outer catcode test applies to the whole group, but is
%   superseded by the inner catcode test. For this to work, we need to
%   keep track of lists of allowed category codes:
%   \cs{l_@@_catcodes_int} and \cs{l_@@_default_catcodes_int} are
%   bitmaps, sums of $4^c$, for all allowed catcodes $c$. The latter is
%   local to each capturing group, and we reset
%   \cs{l_@@_catcodes_int} to that value after each character or
%   class, changing it only when encountering a |\c| escape. The boolean
%   records whether the list of categories of a catcode test has to be
%   inverted: compare |\c[^BE]| and |\c[BE]|.
%    \begin{macrocode}
\int_new:N \l_@@_catcodes_int
\int_new:N \l_@@_default_catcodes_int
\bool_new:N \l_@@_catcodes_bool
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{variable}
%   {
%     \c_@@_catcode_C_int, \c_@@_catcode_B_int, \c_@@_catcode_E_int,
%     \c_@@_catcode_M_int, \c_@@_catcode_T_int, \c_@@_catcode_P_int,
%     \c_@@_catcode_U_int, \c_@@_catcode_D_int, \c_@@_catcode_S_int,
%     \c_@@_catcode_L_int, \c_@@_catcode_O_int, \c_@@_catcode_A_int
%   }
% \begin{variable}{\c_@@_all_catcodes_int}
%   Constants: $4^c$ for each category, and the sum of all powers of $4$.
%    \begin{macrocode}
\int_const:Nn \c_@@_catcode_C_int { "1 } % 4^0
\int_const:Nn \c_@@_catcode_B_int { "4 } % 4^1
\int_const:Nn \c_@@_catcode_E_int { "10 } % 4^2
\int_const:Nn \c_@@_catcode_M_int { "40 } % 4^3
\int_const:Nn \c_@@_catcode_T_int { "100 } % 4^4
\int_const:Nn \c_@@_catcode_P_int { "1000 } % 4^6
\int_const:Nn \c_@@_catcode_U_int { "4000 } % 4^7
\int_const:Nn \c_@@_catcode_D_int { "10000 } % 4^8
\int_const:Nn \c_@@_catcode_S_int { "100000 } % 4^10
\int_const:Nn \c_@@_catcode_L_int { "400000 } % 4^11
\int_const:Nn \c_@@_catcode_O_int { "1000000 } % 4^12
\int_const:Nn \c_@@_catcode_A_int { "4000000 } % 4^13
% Sum of the twelve constants above.
\int_const:Nn \c_@@_all_catcodes_int { "5515155 }
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{variable}{\l_@@_internal_regex}
%   The compilation step stores its result in this variable. It starts
%   out as a copy of \cs{c_@@_no_match_regex}.
%    \begin{macrocode}
\cs_new_eq:NN \l_@@_internal_regex \c_@@_no_match_regex
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_show_prefix_seq}
%   This sequence holds the prefix that makes up the line displayed to
%   the user. The various items must be removed from the right, which is
%   tricky with a token list, hence we use a sequence.
%    \begin{macrocode}
\seq_new:N \l_@@_show_prefix_seq
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_show_lines_int}
%   A hack. To know whether a given class has a single item in it or
%   not, we count the number of lines when showing the class.
%    \begin{macrocode}
\int_new:N \l_@@_show_lines_int
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Generic helpers used when compiling}
%
% \begin{macro}{\@@_two_if_eq:NNNNTF}
%   Used to compare pairs of things like \cs{@@_compile_special:N} |?|
%   together. It's often inconvenient to get the catcodes of the
%   character to match so we just compare the character code.
%   Besides, the expanding behaviour of \cs{if:w} is very useful as that
%   means we can use \cs{c_left_brace_str} and the like.
%    \begin{macrocode}
\cs_new:Npn \@@_two_if_eq:NNNNTF #1#2#3#4
  {
    % Both tests true: \use_ii:nnn removes \use_ii:nn and the false
    % branch, leaving the true branch.  Otherwise \use_ii:nn leaves the
    % false branch.
    \if_meaning:w #1 #3
      \if:w #2 #4
        \exp_after:wN \exp_after:wN \exp_after:wN \use_ii:nnn
      \fi:
    \fi:
    \use_ii:nn
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_get_digits:NTFw}
% \begin{macro}[rEXP]{\@@_get_digits_loop:nw}
%   If followed by some raw digits, collect them one by one in the
%   integer variable |#1|, and take the \texttt{true} branch. Otherwise,
%   take the \texttt{false} branch. The assignment |#1 = #5| is left
%   unterminated so that the loop can supply further digits by
%   expansion; \cs{scan_stop:} ends the number before the
%   \texttt{true} code and the two unused tokens are put back.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_get_digits:NTFw #1#2#3#4#5
  {
    \@@_if_raw_digit:NNTF #4 #5
      { #1 = #5 \@@_get_digits_loop:nw {#2} }
      { #3 #4 #5 }
  }
\cs_new:Npn \@@_get_digits_loop:nw #1#2#3
  {
    \@@_if_raw_digit:NNTF #2 #3
      { #3 \@@_get_digits_loop:nw {#1} }
      { \scan_stop: #1 #2 #3 }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_raw_digit:NNTF}
%   Test used when grabbing digits for the |{m,n}| quantifier.
%   It only accepts non-escaped digits.
%    \begin{macrocode}
\cs_new:Npn \@@_if_raw_digit:NNTF #1#2
  {
    % #1 must be \@@_compile_raw:N, and 1#2 must be a number above one.
    \if_meaning:w \@@_compile_raw:N #1
      \if_int_compare:w \c_one_int < 1 #2 \exp_stop_f:
        \exp_after:wN \exp_after:wN \exp_after:wN \use_ii:nnn
      \fi:
    \fi:
    \use_ii:nn
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Mode}
% \label{sec:regex-modes}
%
% When compiling the \textsc{nfa} corresponding to a given regex string,
% we can be in ten distinct modes, which we label by some magic numbers:
% \begin{itemize}
%   \item[-6] |[\c{...}]| control sequence in a class,
%   \item[-2] |\c{...}| control sequence,
%   \item[0] |...| outer,
%   \item[2] |\c...| catcode test,
%   \item[6] |[\c...]| catcode test in a class,
%   \item[-63] |[\c{[...]}]| class inside mode $-6$,
%   \item[-23] |\c{[...]}| class inside mode $-2$,
%   \item[3] |[...]| class inside mode $0$,
%   \item[23] |\c[...]| class inside mode $2$,
%   \item[63] |[\c[...]]| class inside mode $6$.
% \end{itemize}
% This list is exhaustive, because |\c| escape sequences cannot be
% nested, and character classes cannot be nested directly.
% The choice of
% numbers is such as to optimize the most useful tests, and make
% transitions from one mode to another as simple as possible.
% \begin{itemize}
%   \item Even modes mean that we are not directly in a character class.
%     In this case, a left bracket appends $3$ to the mode. In a
%     character class, a right bracket changes the mode as $m\to
%     (m-15)/13$, truncated.
%   \item Grouping, assertion, and anchors are allowed in non-positive
%     even modes ($0$, $-2$, $-6$), and do not change the
%     mode. Otherwise, they trigger an error.
%   \item A left bracket is special in even modes, appending $3$ to the
%     mode; in those modes, quantifiers and the dot are recognized, and
%     the right bracket is normal. In odd modes (within classes), the
%     left bracket is normal, but the right bracket ends the class,
%     changing the mode from $m$ to $(m-15)/13$, truncated; also, ranges
%     are recognized.
%   \item In non-negative modes, left and right braces are normal. In
%     negative modes, however, left braces trigger a warning; right
%     braces end the control sequence, going from $-2$ to $0$ or $-6$ to
%     $3$, with error recovery for odd modes.
%   \item Properties (such as the |\d| character class) can appear in
%     any mode.
% \end{itemize}
%
% \begin{macro}[EXP]{\@@_if_in_class:TF}
%   Test whether we are directly in a character class (at the innermost
%   level of nesting), that is, whether the mode is odd. There, many
%   escape sequences are not recognized,
%   and special characters are normal. Also, for every raw character, we
%   must look ahead for a possible raw dash.
%    \begin{macrocode}
\cs_new:Npn \@@_if_in_class:TF
  {
    \if_int_odd:w \l_@@_mode_int
      \exp_after:wN \use_i:nn
    \else:
      \exp_after:wN \use_ii:nn
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_in_cs:TF}
%   Right braces are special only directly inside control sequences (at
%   the inner-most level of nesting, not counting groups).
%    \begin{macrocode}
\cs_new:Npn \@@_if_in_cs:TF
  {
    % True exactly for even, negative modes.
    \if_int_odd:w \l_@@_mode_int
      \exp_after:wN \use_ii:nn
    \else:
      \if_int_compare:w \l_@@_mode_int < \c_@@_outer_mode_int
        \exp_after:wN \exp_after:wN \exp_after:wN \use_i:nn
      \else:
        \exp_after:wN \exp_after:wN \exp_after:wN \use_ii:nn
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_in_class_or_catcode:TF}
%   Assertions are only allowed in modes $0$, $-2$, and $-6$,
%   \emph{i.e.}, even, non-positive modes. This test is true in all
%   other modes: odd ones (classes) and positive even ones (catcode
%   tests).
%    \begin{macrocode}
\cs_new:Npn \@@_if_in_class_or_catcode:TF
  {
    \if_int_odd:w \l_@@_mode_int
      \exp_after:wN \use_i:nn
    \else:
      \if_int_compare:w \l_@@_mode_int > \c_@@_outer_mode_int
        \exp_after:wN \exp_after:wN \exp_after:wN \use_i:nn
      \else:
        \exp_after:wN \exp_after:wN \exp_after:wN \use_ii:nn
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_within_catcode:TF}
%   This test takes the true branch if we are in a catcode test, either
%   immediately following it (modes $2$ and $6$) or in a class on which
%   it applies (modes $23$ and $63$). This is used to tweak how left
%   brackets behave in modes $2$ and $6$.
%    \begin{macrocode}
\cs_new:Npn \@@_if_within_catcode:TF
  {
    \if_int_compare:w \l_@@_mode_int > \c_@@_outer_mode_int
      \exp_after:wN \use_i:nn
    \else:
      \exp_after:wN \use_ii:nn
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_chk_c_allowed:T}
%   The |\c| escape sequence is only allowed in modes $0$ and $3$,
%   \emph{i.e.}, not within any other |\c| escape sequence. In any
%   other mode an error is raised and the argument is discarded.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_chk_c_allowed:T
  {
    \if_int_compare:w \l_@@_mode_int = \c_@@_outer_mode_int
      \exp_after:wN \use:n
    \else:
      \if_int_compare:w \l_@@_mode_int = \c_@@_class_mode_int
        \exp_after:wN \exp_after:wN \exp_after:wN \use:n
      \else:
        \msg_error:nn { regex } { c-bad-mode }
        \exp_after:wN \exp_after:wN \exp_after:wN \use_none:n
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_mode_quit_c:}
%   This function changes the mode as it is needed just after a catcode
%   test.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_mode_quit_c:
  {
    % Mode 2 goes back to mode 0 and mode 6 back to mode 3; every other
    % mode is left unchanged.
    \if_int_compare:w \l_@@_mode_int = \c_@@_catcode_mode_int
      \int_set_eq:NN \l_@@_mode_int \c_@@_outer_mode_int
    \else:
      \if_int_compare:w \l_@@_mode_int =
        \c_@@_catcode_in_class_mode_int
        \int_set_eq:NN \l_@@_mode_int \c_@@_class_mode_int
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Framework}
%
% \begin{macro}{\@@_compile:w, \@@_compile_end:}
%   Used when compiling a user regex or a regex for the |\c{...}| escape
%   sequence within another regex. Start building a token list within a
%   group (with \texttt{e}-expansion at the outset), and set a few
%   variables (group level, catcodes), then start the first branch. At
%   the end, make sure there are no dangling classes nor groups, close
%   the last branch: we are done building \cs{l_@@_internal_regex}.
%
%   Braces which are opened in one step and closed in another are
%   written as |\if_false: } \fi:| and |\if_false: { \fi: }| so that
%   each piece of code stays balanced. For every group left open,
%   \cs{@@_compile_end:} adds the closing brace of the current branch
%   and of the group, followed by the quantifier |{1}{0}| and
%   \cs{c_true_bool}, then closes the \TeX{} group and appends the
%   result to the token list built one level up.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile:w
  {
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \int_zero:N \l_@@_group_level_int
      \int_set_eq:NN \l_@@_default_catcodes_int
        \c_@@_all_catcodes_int
      \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int
      \cs_set:Npn \@@_item_equal:n  { \@@_item_caseful_equal:n }
      \cs_set:Npn \@@_item_range:nn { \@@_item_caseful_range:nn }
      \tl_build_put_right:Nn \l_@@_build_tl
        { \@@_branch:n { \if_false: } \fi: }
  }
\cs_new_protected:Npn \@@_compile_end:
  {
      % A class still open: complain, then close it as if a right
      % bracket had been given.
      \@@_if_in_class:TF
        {
          \msg_error:nn { regex } { missing-rbrack }
          \use:c { @@_compile_]: }
          \prg_do_nothing: \prg_do_nothing:
        }
        { }
      \if_int_compare:w \l_@@_group_level_int > \c_zero_int
        \msg_error:nne { regex } { missing-rparen }
          { \int_use:N \l_@@_group_level_int }
        \prg_replicate:nn
          { \l_@@_group_level_int }
          {
              \tl_build_put_right:Nn \l_@@_build_tl
                {
                  \if_false: { \fi: }
                  \if_false: { \fi: } { 1 } { 0 } \c_true_bool
                }
              \tl_build_end:N \l_@@_build_tl
              \exp_args:NNNo
            \group_end:
            \tl_build_put_right:Nn \l_@@_build_tl
              { \l_@@_build_tl }
          }
      \fi:
      \tl_build_put_right:Nn \l_@@_build_tl { \if_false: { \fi: } }
      \tl_build_end:N \l_@@_build_tl
      \exp_args:NNNe
    \group_end:
    \tl_set:Nn \l_@@_internal_regex { \l_@@_build_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile:n}
%   The compilation is done between \cs{@@_compile:w} and
%   \cs{@@_compile_end:}, starting in mode~$0$. Then
%   \cs{@@_escape_use:nnnn} distinguishes special characters, escaped
%   alphanumerics, and raw characters, interpreting |\a|, |\x| and other
%   sequences. The $4$ trailing \cs{prg_do_nothing:} are needed because
%   some functions defined later look up to $4$ tokens ahead. Before
%   ending, make sure that any |\c{...}| is properly closed. No need to
%   check that brackets are closed properly since \cs{@@_compile_end:}
%   does that. However, catch the case of a trailing |\cL|
%   construction.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile:n #1
  {
    \@@_compile:w
      \@@_standard_escapechar:
      \int_set_eq:NN \l_@@_mode_int \c_@@_outer_mode_int
      \@@_escape_use:nnnn
        {
          \@@_char_if_special:NTF ##1
            \@@_compile_special:N \@@_compile_raw:N ##1
        }
        {
          \@@_char_if_alphanumeric:NTF ##1
            \@@_compile_escaped:N \@@_compile_raw:N ##1
        }
        { \@@_compile_raw:N ##1 }
        { #1 }
      \prg_do_nothing: \prg_do_nothing:
      \prg_do_nothing: \prg_do_nothing:
      \int_compare:nNnT \l_@@_mode_int = \c_@@_catcode_mode_int
        { \msg_error:nn { regex } { c-trailing } }
      \int_compare:nNnT \l_@@_mode_int < \c_@@_outer_mode_int
        {
          \msg_error:nn { regex } { c-missing-rbrace }
          \@@_compile_end_cs:
          \prg_do_nothing: \prg_do_nothing:
          \prg_do_nothing: \prg_do_nothing:
        }
    \@@_compile_end:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_use:n}
%   Use a regex, regardless of whether it is given as a string (in which
%   case we need to compile) or as a regex variable. This is used for
%   \cs{regex_match_case:nn} and related functions to allow a mixture of
%   explicit regex and regex variables.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_use:n #1
  {
    % For a single token whose meaning starts like a compiled regex,
    % the auxiliary leaves \use_ii:nnn, which removes \@@_compile:n and
    % \l_@@_internal_regex below and keeps #1 only.
    \tl_if_single_token:nT {#1}
      {
        \exp_after:wN \@@_compile_use_aux:w
        \token_to_meaning:N #1 ~ \q_@@_nil
      }
    \@@_compile:n {#1} \l_@@_internal_regex
  }
\cs_new_protected:Npn \@@_compile_use_aux:w #1 ~ #2 \q_@@_nil
  {
    % #1 is the meaning up to its first space.
    \str_if_eq:nnT { #1 ~ } { macro:->\@@_branch:n }
      { \use_ii:nnn }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_escaped:N, \@@_compile_special:N}
%   If the special character or escaped alphanumeric has a particular
%   meaning in regexes, the corresponding function is used. Otherwise,
%   it is interpreted as a raw character. We distinguish special
%   characters from escaped alphanumeric characters because they behave
%   differently when appearing as an end-point of a range. The handler
%   for a special character |#1| is named |\@@_compile_#1:|, and that
%   for an escaped alphanumeric is named |\@@_compile_/#1:|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_special:N #1
  {
    \cs_if_exist_use:cF { @@_compile_#1: }
      { \@@_compile_raw:N #1 }
  }
\cs_new_protected:Npn \@@_compile_escaped:N #1
  {
    \cs_if_exist_use:cF { @@_compile_/#1: }
      { \@@_compile_raw:N #1 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_one:n}
%   This is used after finding one \enquote{test}, such as |\d|, or a
%   raw character. If that followed a catcode test (\emph{e.g.}, |\cL|),
%   then restore the mode. If we are not in a class, then the test is
%   \enquote{standalone}, and we need to add \cs{@@_class:NnnnN} and
%   search for quantifiers. In any case, insert the test, possibly
%   together with a catcode test if appropriate.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_one:n #1
  {
    \@@_mode_quit_c:
    % Standalone test: open a class; its brace is closed later by the
    % quantifier code.
    \@@_if_in_class:TF { }
      {
        \tl_build_put_right:Nn \l_@@_build_tl
          { \@@_class:NnnnN \c_true_bool { \if_false: } \fi: }
      }
    % Wrap the test in a catcode test unless all catcodes are allowed.
    \tl_build_put_right:Ne \l_@@_build_tl
      {
        \if_int_compare:w \l_@@_catcodes_int <
          \c_@@_all_catcodes_int
          \@@_item_catcode:nT { \int_use:N \l_@@_catcodes_int }
            { \exp_not:N \exp_not:n {#1} }
        \else:
          \exp_not:N \exp_not:n {#1}
        \fi:
      }
    \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int
    % Must come last: this reads the tokens following in the input.
    \@@_if_in_class:TF { } { \@@_compile_quantifier:w }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\@@_compile_abort_tokens:n, \@@_compile_abort_tokens:e}
%   This function places the collected tokens back in the input stream,
%   each as a raw character. Spaces are not preserved.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_abort_tokens:n #1
  {
    \use:e
      {
        \exp_args:No \tl_map_function:nN { \tl_to_str:n {#1} }
          \@@_compile_raw:N
      }
  }
\cs_generate_variant:Nn \@@_compile_abort_tokens:n { e }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Quantifiers}
%
% \begin{macro}{\@@_compile_if_quantifier:TFw}
%   This looks ahead and checks whether there is a quantifier
%   (special character equal to one of \texttt{?+*\{}). This is
%   useful for the |\u| and |\ur| escape sequences. The two tokens
%   inspected are put back in the input after the selected branch.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_if_quantifier:TFw #1#2#3#4
  {
    \token_if_eq_meaning:NNTF #3 \@@_compile_special:N
      { \cs_if_exist:cTF { @@_compile_quantifier_#4:w } }
      { \use_ii:nn }
    {#1} {#2} #3 #4
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_quantifier:w}
%   This looks ahead and finds any quantifier (special character equal
%   to one of \texttt{?+*\{}).
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_quantifier:w #1#2
  {
    % A quantifier is a special character with a dedicated handler; in
    % all other cases the two tokens are put back unchanged.
    \token_if_eq_meaning:NNTF #1 \@@_compile_special:N
      {
        \cs_if_exist_use:cF { @@_compile_quantifier_#2:w }
          { \@@_compile_quantifier_none: #1 #2 }
      }
      { \@@_compile_quantifier_none: #1 #2 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_quantifier_none:}
% \begin{macro}{\@@_compile_quantifier_abort:eNN}
%   Those functions are called whenever there is no quantifier, or a
%   braced construction is invalid (equivalent to no quantifier, and
%   whatever characters were grabbed are left raw). The quantifier
%   written in that case is |{1}{0}| followed by \cs{c_false_bool}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_quantifier_none:
  {
    \tl_build_put_right:Nn \l_@@_build_tl
      { \if_false: { \fi: } { 1 } { 0 } \c_false_bool }
  }
\cs_new_protected:Npn \@@_compile_quantifier_abort:eNN #1#2#3
  {
    \@@_compile_quantifier_none:
    \msg_warning:nnee { regex } { invalid-quantifier } {#1} {#3}
    \@@_compile_abort_tokens:e {#1}
    #2 #3
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_compile_quantifier_laziness:nnNN}
%   Once the \enquote{main} quantifier (\texttt{?}, \texttt{*},
%   \texttt{+} or a braced construction) is found, we check whether it
%   is lazy (followed by a question mark). We then add to the compiled
%   regex a closing brace (ending \cs{@@_class:NnnnN} and friends),
%   the start-point of the range, its end-point, and a boolean,
%   \texttt{true} for lazy and \texttt{false} for greedy operators.
%   When no question mark follows, the two tokens read ahead are put
%   back in the input.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_quantifier_laziness:nnNN #1#2#3#4
  {
    \@@_two_if_eq:NNNNTF #3 #4 \@@_compile_special:N ?
      {
        \tl_build_put_right:Nn \l_@@_build_tl
          { \if_false: { \fi: } { #1 } { #2 } \c_true_bool }
      }
      {
        \tl_build_put_right:Nn \l_@@_build_tl
          { \if_false: { \fi: } { #1 } { #2 } \c_false_bool }
        #3 #4
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_compile_quantifier_?:w,
%     \@@_compile_quantifier_*:w,
%     \@@_compile_quantifier_+:w
%   }
%   For each \enquote{basic} quantifier, |?|, |*|, |+|, feed the correct
%   arguments to \cs{@@_compile_quantifier_laziness:nnNN}, $-1$ means
%   that there is no upper bound on the number of repetitions.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_quantifier_?:w }
  { \@@_compile_quantifier_laziness:nnNN { 0 } { 1 } }
\cs_new_protected:cpn { @@_compile_quantifier_*:w }
  { \@@_compile_quantifier_laziness:nnNN { 0 } { -1 } }
\cs_new_protected:cpn { @@_compile_quantifier_+:w }
  { \@@_compile_quantifier_laziness:nnNN { 1 } { -1 } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}+\@@_compile_quantifier_{:w+ ^^A}
% \begin{macro}
%   {
%     \@@_compile_quantifier_braced_auxi:w,
%     \@@_compile_quantifier_braced_auxii:w,
%     \@@_compile_quantifier_braced_auxiii:w
%   }
%   Three possible syntaxes: \texttt{\{\meta{int}\}},
%   \texttt{\{\meta{int},\}}, or \texttt{\{\meta{int},\meta{int}\}}. Any
%   other syntax causes us to abort and put whatever we collected back
%   in the input stream, as \texttt{raw} characters, including the
%   opening brace. Grab a number into \cs{l_@@_internal_a_int}. If
%   the number is followed by a right brace, the range is $[a,a]$. If
%   followed by a comma, grab one more number, and call the \texttt{_ii}
%   or \texttt{_iii} auxiliary. Those auxiliaries check for a closing
%   brace, leading to the range $[a,\infty]$ or $[a,b]$, encoded as
%   $\{a\}\{-1\}$ and $\{a\}\{b-a\}$.
% \begin{macrocode} \cs_new_protected:cpn { @@_compile_quantifier_ \c_left_brace_str :w } { \@@_get_digits:NTFw \l_@@_internal_a_int { \@@_compile_quantifier_braced_auxi:w } { \@@_compile_quantifier_abort:eNN { \c_left_brace_str } } } \cs_new_protected:Npn \@@_compile_quantifier_braced_auxi:w #1#2 { \str_case_e:nnF { #1 #2 } { { \@@_compile_special:N \c_right_brace_str } { \exp_args:No \@@_compile_quantifier_laziness:nnNN { \int_use:N \l_@@_internal_a_int } 0 } { \@@_compile_special:N , } { \@@_get_digits:NTFw \l_@@_internal_b_int { \@@_compile_quantifier_braced_auxiii:w } { \@@_compile_quantifier_braced_auxii:w } } } { \@@_compile_quantifier_abort:eNN { \c_left_brace_str \int_use:N \l_@@_internal_a_int } #1 #2 } } \cs_new_protected:Npn \@@_compile_quantifier_braced_auxii:w #1#2 { \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N \c_right_brace_str { \exp_args:No \@@_compile_quantifier_laziness:nnNN { \int_use:N \l_@@_internal_a_int } { -1 } } { \@@_compile_quantifier_abort:eNN { \c_left_brace_str \int_use:N \l_@@_internal_a_int , } #1 #2 } } \cs_new_protected:Npn \@@_compile_quantifier_braced_auxiii:w #1#2 { \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N \c_right_brace_str { \if_int_compare:w \l_@@_internal_a_int > \l_@@_internal_b_int \msg_error:nnee { regex } { backwards-quantifier } { \int_use:N \l_@@_internal_a_int } { \int_use:N \l_@@_internal_b_int } \int_zero:N \l_@@_internal_b_int \else: \int_sub:Nn \l_@@_internal_b_int \l_@@_internal_a_int \fi: \exp_args:Noo \@@_compile_quantifier_laziness:nnNN { \int_use:N \l_@@_internal_a_int } { \int_use:N \l_@@_internal_b_int } } { \@@_compile_quantifier_abort:eNN { \c_left_brace_str \int_use:N \l_@@_internal_a_int , \int_use:N \l_@@_internal_b_int } #1 #2 } } % \end{macrocode} % \end{macro} % \end{macro} % % \subsubsection{Raw characters} % % \begin{macro}{\@@_compile_raw_error:N} % Within character classes, and following catcode tests, some escaped % alphanumeric sequences such as |\b| do not have any meaning. 
They % are replaced by a raw character, after spitting out an error. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_raw_error:N #1 { \msg_error:nne { regex } { bad-escape } {#1} \@@_compile_raw:N #1 } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_raw:N} % If we are in a character class and the next character is an % unescaped dash, this denotes a range. Otherwise, the current % character |#1| matches itself. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_raw:N #1#2#3 { \@@_if_in_class:TF { \@@_two_if_eq:NNNNTF #2 #3 \@@_compile_special:N - { \@@_compile_range:Nw #1 } { \@@_compile_one:n { \@@_item_equal:n { \int_value:w `#1 } } #2 #3 } } { \@@_compile_one:n { \@@_item_equal:n { \int_value:w `#1 } } #2 #3 } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_range:Nw, \@@_if_end_range:NNTF} % We have just read a raw character followed by a dash; this should be % followed by an end-point for the range. Valid end-points are: any % raw character; any special character, except a right bracket. In % particular, escaped characters are forbidden. 
% \begin{macrocode} \cs_new_protected:Npn \@@_if_end_range:NNTF #1#2 { \if_meaning:w \@@_compile_raw:N #1 \else: \if_meaning:w \@@_compile_special:N #1 \if_charcode:w ] #2 \use_i:nn \fi: \else: \exp_after:wN \exp_after:wN \exp_after:wN \use_iii:nnn \fi: \fi: \use_i:nn } \cs_new_protected:Npn \@@_compile_range:Nw #1#2#3 { \@@_if_end_range:NNTF #2 #3 { \if_int_compare:w `#1 > `#3 \exp_stop_f: \msg_error:nnee { regex } { range-backwards } {#1} {#3} \else: \tl_build_put_right:Ne \l_@@_build_tl { \if_int_compare:w `#1 = `#3 \exp_stop_f: \@@_item_equal:n \else: \@@_item_range:nn { \int_value:w `#1 } \fi: { \int_value:w `#3 } } \fi: } { \msg_warning:nnee { regex } { range-missing-end } {#1} { \c_backslash_str #3 } \tl_build_put_right:Ne \l_@@_build_tl { \@@_item_equal:n { \int_value:w `#1 \exp_stop_f: } \@@_item_equal:n { \int_value:w `- \exp_stop_f: } } #2#3 } } % \end{macrocode} % \end{macro} % % \subsubsection{Character properties} % % \begin{macro}{\@@_compile_.:, \@@_prop_.:} % In a class, the dot has no special meaning. Outside, insert % \cs{@@_prop_.:}, which matches any character or control % sequence, and refuses $-2$ (end-marker). % \begin{macrocode} \cs_new_protected:cpe { @@_compile_.: } { \exp_not:N \@@_if_in_class:TF { \@@_compile_raw:N . } { \@@_compile_one:n \exp_not:c { @@_prop_.: } } } \cs_new_protected:cpn { @@_prop_.: } { \if_int_compare:w \l_@@_curr_char_int > - 2 \exp_stop_f: \exp_after:wN \@@_break_true:w \fi: } % \end{macrocode} % \end{macro} % % \begin{macro} % { % \@@_compile_/d:, \@@_compile_/D:, % \@@_compile_/h:, \@@_compile_/H:, % \@@_compile_/s:, \@@_compile_/S:, % \@@_compile_/v:, \@@_compile_/V:, % \@@_compile_/w:, \@@_compile_/W:, % \@@_compile_/N:, % } % The constants \cs{@@_prop_d:}, \emph{etc.} hold % a list of tests which match the corresponding character % class, and jump to the \cs{@@_break_point:TF} marker. % As for a normal character, we check for quantifiers. 
% \begin{macrocode} \cs_set_protected:Npn \@@_tmp:w #1#2 { \cs_new_protected:cpe { @@_compile_/#1: } { \@@_compile_one:n \exp_not:c { @@_prop_#1: } } \cs_new_protected:cpe { @@_compile_/#2: } { \@@_compile_one:n { \@@_item_reverse:n { \exp_not:c { @@_prop_#1: } } } } } \@@_tmp:w d D \@@_tmp:w h H \@@_tmp:w s S \@@_tmp:w v V \@@_tmp:w w W \cs_new_protected:cpn { @@_compile_/N: } { \@@_compile_one:n \@@_prop_N: } % \end{macrocode} % \end{macro} % % \subsubsection{Anchoring and simple assertions} % % \begin{macro}{\@@_compile_anchor_letter:NNN} % \begin{macro}{\@@_compile_/A:, \@@_compile_/G:, \@@_compile_/Z:, \@@_compile_/z:, \@@_compile_/b:, \@@_compile_/B:} % \begin{macro}+\@@_compile_^:+ % \begin{macro}+\@@_compile_$:+ % In modes where assertions are forbidden, anchors such as |\A| % produce an error (|\A|~is invalid in classes); otherwise they add an % \cs{@@_assertion:Nn} test as appropriate (the only negative % assertion is~|\B|). The test functions are defined later. The % implementation for % |$| and |^| is only different from |\A| etc because these are valid % in a class. 
% \begin{macrocode} \cs_new_protected:Npn \@@_compile_anchor_letter:NNN #1#2#3 { \@@_if_in_class_or_catcode:TF { \@@_compile_raw_error:N #1 } { \tl_build_put_right:Nn \l_@@_build_tl { \@@_assertion:Nn #2 {#3} } } } \cs_new_protected:cpn { @@_compile_/A: } { \@@_compile_anchor_letter:NNN A \c_true_bool \@@_A_test: } \cs_new_protected:cpn { @@_compile_/G: } { \@@_compile_anchor_letter:NNN G \c_true_bool \@@_G_test: } \cs_new_protected:cpn { @@_compile_/Z: } { \@@_compile_anchor_letter:NNN Z \c_true_bool \@@_Z_test: } \cs_new_protected:cpn { @@_compile_/z: } { \@@_compile_anchor_letter:NNN z \c_true_bool \@@_Z_test: } \cs_new_protected:cpn { @@_compile_/b: } { \@@_compile_anchor_letter:NNN b \c_true_bool \@@_b_test: } \cs_new_protected:cpn { @@_compile_/B: } { \@@_compile_anchor_letter:NNN B \c_false_bool \@@_b_test: } \cs_set_protected:Npn \@@_tmp:w #1#2 { \cs_new_protected:cpn { @@_compile_#1: } { \@@_if_in_class_or_catcode:TF { \@@_compile_raw:N #1 } { \tl_build_put_right:Nn \l_@@_build_tl { \@@_assertion:Nn \c_true_bool {#2} } } } } \exp_args:Ne \@@_tmp:w { \iow_char:N \^ } { \@@_A_test: } \exp_args:Ne \@@_tmp:w { \iow_char:N \$ } { \@@_Z_test: } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % % \subsubsection{Character classes} % % \begin{macro}{\@@_compile_]:} % Outside a class, right brackets have no meaning. In a class, change % the mode ($m\to (m-15)/13$, truncated) to reflect the fact that we % are leaving the class. Look for quantifiers, unless we are still in % a class after leaving one (the case of |[...\cL[...]...]|).
% \begin{macrocode} \cs_new_protected:cpn { @@_compile_]: } { \@@_if_in_class:TF { \if_int_compare:w \l_@@_mode_int > \c_@@_catcode_in_class_mode_int \tl_build_put_right:Nn \l_@@_build_tl { \if_false: { \fi: } } \fi: \tex_advance:D \l_@@_mode_int - 15 \exp_stop_f: \tex_divide:D \l_@@_mode_int 13 \exp_stop_f: \if_int_odd:w \l_@@_mode_int \else: \exp_after:wN \@@_compile_quantifier:w \fi: } { \@@_compile_raw:N ] } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_[:} % In a class, left brackets might introduce a \textsc{posix} character % class, or mean nothing. Immediately following |\c|\meta{category}, % we must insert the appropriate catcode test, then parse the class; we % pre-expand the catcode as an optimization. Otherwise (modes $0$, % $-2$ and $-6$) just parse the class. The mode is updated later. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_[: } { \@@_if_in_class:TF { \@@_compile_class_posix_test:w } { \@@_if_within_catcode:TF { \exp_after:wN \@@_compile_class_catcode:w \int_use:N \l_@@_catcodes_int ; } { \@@_compile_class_normal:w } } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_class_normal:w} % In the \enquote{normal} case, we insert \cs{@@_class:NnnnN} % \meta{boolean} in the compiled code. The \meta{boolean} is true for % positive classes, and false for negative classes, characterized by a % leading |^|. The auxiliary \cs{@@_compile_class:TFNN} also % checks for a leading |]| which has a special meaning. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_class_normal:w { \@@_compile_class:TFNN { \@@_class:NnnnN \c_true_bool } { \@@_class:NnnnN \c_false_bool } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_class_catcode:w} % This function is called for a left bracket in modes $2$ or $6$ % (catcode test, and catcode test within a class). In mode $2$ the % whole construction needs to be put in a class (like single % character). 
Then determine if the class is positive or negative, % inserting \cs{@@_item_catcode:nT} or the \texttt{reverse} variant % as appropriate, each with the current catcodes bitmap |#1| as an % argument, and reset the catcodes. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_class_catcode:w #1; { \if_int_compare:w \l_@@_mode_int = \c_@@_catcode_mode_int \tl_build_put_right:Nn \l_@@_build_tl { \@@_class:NnnnN \c_true_bool { \if_false: } \fi: } \fi: \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int \@@_compile_class:TFNN { \@@_item_catcode:nT {#1} } { \@@_item_catcode_reverse:nT {#1} } } % \end{macrocode} % \end{macro} % % \begin{macro} % {\@@_compile_class:TFNN, \@@_compile_class:NN} % If the first character is |^|, then the class is negative (use % |#2|), otherwise it is positive (use |#1|). If the next character % is a right bracket, then it should be changed to a raw one. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_class:TFNN #1#2#3#4 { \l_@@_mode_int = \int_value:w \l_@@_mode_int 3 \exp_stop_f: \@@_two_if_eq:NNNNTF #3 #4 \@@_compile_special:N ^ { \tl_build_put_right:Nn \l_@@_build_tl { #2 { \if_false: } \fi: } \@@_compile_class:NN } { \tl_build_put_right:Nn \l_@@_build_tl { #1 { \if_false: } \fi: } \@@_compile_class:NN #3 #4 } } \cs_new_protected:Npn \@@_compile_class:NN #1#2 { \token_if_eq_charcode:NNTF #2 ] { \@@_compile_raw:N #2 } { #1 #2 } } % \end{macrocode} % \end{macro} % % \begin{macro} % { % \@@_compile_class_posix_test:w, % \@@_compile_class_posix:NNNNw, % \@@_compile_class_posix_loop:w, % \@@_compile_class_posix_end:w % } % Here we check for a syntax such as |[:alpha:]|. We also detect |[=| % and |[.| which have a meaning in \textsc{posix} regular expressions, % but are not implemented in \pkg{l3regex}. In case we see |[:|, grab % raw characters until hopefully reaching |:]|. If that's missing, or % the \textsc{posix} class is unknown, abort. 
If all is right, add the % test to the current class, with an extra \cs{@@_item_reverse:n} % for negative classes (we make sure to wrap its argument in braces % otherwise \cs{regex_show:N} would not recognize the regex as valid). % \begin{macrocode} \cs_new_protected:Npn \@@_compile_class_posix_test:w #1#2 { \token_if_eq_meaning:NNT \@@_compile_special:N #1 { \str_case:nn { #2 } { : { \@@_compile_class_posix:NNNNw } = { \msg_warning:nne { regex } { posix-unsupported } { = } } . { \msg_warning:nne { regex } { posix-unsupported } { . } } } } \@@_compile_raw:N [ #1 #2 } \cs_new_protected:Npn \@@_compile_class_posix:NNNNw #1#2#3#4#5#6 { \@@_two_if_eq:NNNNTF #5 #6 \@@_compile_special:N ^ { \bool_set_false:N \l_@@_internal_bool \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi: \@@_compile_class_posix_loop:w } { \bool_set_true:N \l_@@_internal_bool \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi: \@@_compile_class_posix_loop:w #5 #6 } } \cs_new:Npn \@@_compile_class_posix_loop:w #1#2 { \token_if_eq_meaning:NNTF \@@_compile_raw:N #1 { #2 \@@_compile_class_posix_loop:w } { \if_false: { \fi: } \@@_compile_class_posix_end:w #1 #2 } } \cs_new_protected:Npn \@@_compile_class_posix_end:w #1#2#3#4 { \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N : { \@@_two_if_eq:NNNNTF #3 #4 \@@_compile_special:N ] } { \use_ii:nn } { \cs_if_exist:cTF { @@_posix_ \l_@@_internal_a_tl : } { \@@_compile_one:n { \bool_if:NTF \l_@@_internal_bool \use:n \@@_item_reverse:n { \exp_not:c { @@_posix_ \l_@@_internal_a_tl : } } } } { \msg_warning:nne { regex } { posix-unknown } { \l_@@_internal_a_tl } \@@_compile_abort_tokens:e { [: \bool_if:NF \l_@@_internal_bool { ^ } \l_@@_internal_a_tl :] } } } { \msg_error:nnee { regex } { posix-missing-close } { [: \l_@@_internal_a_tl } { #2 #4 } \@@_compile_abort_tokens:e { [: \l_@@_internal_a_tl } #1 #2 #3 #4 } } % \end{macrocode} % \end{macro} % % \subsubsection{Groups and alternations} % % \begin{macro}{\@@_compile_group_begin:N, 
\@@_compile_group_end:} % The contents of a regex group are turned into compiled code in % \cs{l_@@_build_tl}, which ends up with items of the form % \cs{@@_branch:n} \Arg{concatenation}. This construction is done % using \cs[no-index]{tl_build_\ldots{}} functions within a \TeX{} group, which automatically % makes sure that options (case-sensitivity and default catcode) are % reset at the end of the group. The argument |#1| is % \cs{@@_group:nnnN} or a variant thereof. A small subtlety to % support |\cL(abc)| as a shorthand for |(\cLa\cLb\cLc)|: exit any % pending catcode test, save the category code at the start of the % group as the default catcode for that group, and make sure that the % catcode is restored to the default outside the group. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_group_begin:N #1 { \tl_build_put_right:Nn \l_@@_build_tl { #1 { \if_false: } \fi: } \@@_mode_quit_c: \group_begin: \tl_build_begin:N \l_@@_build_tl \int_set_eq:NN \l_@@_default_catcodes_int \l_@@_catcodes_int \int_incr:N \l_@@_group_level_int \tl_build_put_right:Nn \l_@@_build_tl { \@@_branch:n { \if_false: } \fi: } } \cs_new_protected:Npn \@@_compile_group_end: { \if_int_compare:w \l_@@_group_level_int > \c_zero_int \tl_build_put_right:Nn \l_@@_build_tl { \if_false: { \fi: } } \tl_build_end:N \l_@@_build_tl \exp_args:NNNe \group_end: \tl_build_put_right:Nn \l_@@_build_tl { \l_@@_build_tl } \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int \exp_after:wN \@@_compile_quantifier:w \else: \msg_warning:nn { regex } { extra-rparen } \exp_after:wN \@@_compile_raw:N \exp_after:wN ) \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_(:} % In a class, parentheses are not special. In a catcode test inside a % class, a left parenthesis gives an error, to catch |[a\cL(bcd)e]|. % Otherwise check for a |?|, denoting special groups, and run the code % for the corresponding special group. 
% \begin{macrocode} \cs_new_protected:cpn { @@_compile_(: } { \@@_if_in_class:TF { \@@_compile_raw:N ( } { \if_int_compare:w \l_@@_mode_int = \c_@@_catcode_in_class_mode_int \msg_error:nn { regex } { c-lparen-in-class } \exp_after:wN \@@_compile_raw:N \exp_after:wN ( \else: \exp_after:wN \@@_compile_lparen:w \fi: } } \cs_new_protected:Npn \@@_compile_lparen:w #1#2#3#4 { \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N ? { \cs_if_exist_use:cF { @@_compile_special_group_\token_to_str:N #4 :w } { \msg_warning:nne { regex } { special-group-unknown } { (? #4 } \@@_compile_group_begin:N \@@_group:nnnN \@@_compile_raw:N ? #3 #4 } } { \@@_compile_group_begin:N \@@_group:nnnN #1 #2 #3 #4 } } % \end{macrocode} % \end{macro} % % \begin{macro}+\@@_compile_|:+ % In a class, the pipe is not special. Otherwise, end the current % branch and open another one. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_|: } { \@@_if_in_class:TF { \@@_compile_raw:N | } { \tl_build_put_right:Nn \l_@@_build_tl { \if_false: { \fi: } \@@_branch:n { \if_false: } \fi: } } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_):} % Within a class, parentheses are not special. Outside, close a group. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_): } { \@@_if_in_class:TF { \@@_compile_raw:N ) } { \@@_compile_group_end: } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_special_group_::w} % \begin{macro}+\@@_compile_special_group_|:w+ % Non-capturing, and resetting groups are easy to take care of during % compilation; for those groups, the harder parts come when building. 
% \begin{macrocode} \cs_new_protected:cpn { @@_compile_special_group_::w } { \@@_compile_group_begin:N \@@_group_no_capture:nnnN } \cs_new_protected:cpn { @@_compile_special_group_|:w } { \@@_compile_group_begin:N \@@_group_resetting:nnnN } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro} % {\@@_compile_special_group_i:w, \@@_compile_special_group_-:w} % The match can be made case-insensitive by setting the option with % \texttt{(?i)}; the original behaviour is restored by \texttt{(?-i)}. % This is the only supported option. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_special_group_i:w #1#2 { \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N ) { \cs_set:Npn \@@_item_equal:n { \@@_item_caseless_equal:n } \cs_set:Npn \@@_item_range:nn { \@@_item_caseless_range:nn } } { \msg_warning:nne { regex } { unknown-option } { (?i #2 } \@@_compile_raw:N ( \@@_compile_raw:N ? \@@_compile_raw:N i #1 #2 } } \cs_new_protected:cpn { @@_compile_special_group_-:w } #1#2#3#4 { \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_raw:N i { \@@_two_if_eq:NNNNTF #3 #4 \@@_compile_special:N ) } { \use_ii:nn } { \cs_set:Npn \@@_item_equal:n { \@@_item_caseful_equal:n } \cs_set:Npn \@@_item_range:nn { \@@_item_caseful_range:nn } } { \msg_warning:nne { regex } { unknown-option } { (?-#2#4 } \@@_compile_raw:N ( \@@_compile_raw:N ? \@@_compile_raw:N - #1 #2 #3 #4 } } % \end{macrocode} % \end{macro} % % \subsubsection{Catcodes and csnames} % % \begin{macro}{\@@_compile_/c:, \@@_compile_c_test:NN} % The |\c| escape sequence can be followed by a capital letter % representing a character category, by a left bracket which starts a % list of categories, or by a brace group holding a regular expression % for a control sequence name. Otherwise, raise an error. 
% \begin{macrocode} \cs_new_protected:cpn { @@_compile_/c: } { \@@_chk_c_allowed:T { \@@_compile_c_test:NN } } \cs_new_protected:Npn \@@_compile_c_test:NN #1#2 { \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N { \int_if_exist:cTF { c_@@_catcode_#2_int } { \int_set_eq:Nc \l_@@_catcodes_int { c_@@_catcode_#2_int } \l_@@_mode_int = \if_case:w \l_@@_mode_int \c_@@_catcode_mode_int \else: \c_@@_catcode_in_class_mode_int \fi: \token_if_eq_charcode:NNT C #2 { \@@_compile_c_C:NN } } } { \cs_if_exist_use:cF { @@_compile_c_#2:w } } { \msg_error:nne { regex } { c-missing-category } {#2} #1 #2 } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_c_C:NN} % If |\cC| is not followed by |.| or |(...)| then complain because % that construction cannot match anything, except in cases like % |\cC[\c{...}]|, where it has no effect. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_c_C:NN #1#2 { \token_if_eq_meaning:NNTF #1 \@@_compile_special:N { \token_if_eq_charcode:NNTF #2 . { \use_none:n } { \token_if_eq_charcode:NNF #2 ( } % ) } { \use:n } { \msg_error:nnn { regex } { c-C-invalid } {#2} } #1 #2 } % \end{macrocode} % \end{macro} % % \begin{macro} % { % \@@_compile_c_[:w, % \@@_compile_c_lbrack_loop:NN, % \@@_compile_c_lbrack_add:N, % \@@_compile_c_lbrack_end:, % } % When encountering |\c[|, the task is to collect uppercase letters % representing character categories. First check for |^| which negates % the list of category codes. 
% \begin{macrocode} \cs_new_protected:cpn { @@_compile_c_[:w } #1#2 { \l_@@_mode_int = \if_case:w \l_@@_mode_int \c_@@_catcode_mode_int \else: \c_@@_catcode_in_class_mode_int \fi: \int_zero:N \l_@@_catcodes_int \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N ^ { \bool_set_false:N \l_@@_catcodes_bool \@@_compile_c_lbrack_loop:NN } { \bool_set_true:N \l_@@_catcodes_bool \@@_compile_c_lbrack_loop:NN #1 #2 } } \cs_new_protected:Npn \@@_compile_c_lbrack_loop:NN #1#2 { \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N { \int_if_exist:cTF { c_@@_catcode_#2_int } { \exp_args:Nc \@@_compile_c_lbrack_add:N { c_@@_catcode_#2_int } \@@_compile_c_lbrack_loop:NN } } { \token_if_eq_charcode:NNTF #2 ] { \@@_compile_c_lbrack_end: } } { \msg_error:nne { regex } { c-missing-rbrack } {#2} \@@_compile_c_lbrack_end: #1 #2 } } \cs_new_protected:Npn \@@_compile_c_lbrack_add:N #1 { \if_int_odd:w \@@_int_eval:w \l_@@_catcodes_int / #1 \scan_stop: \else: \int_add:Nn \l_@@_catcodes_int {#1} \fi: } \cs_new_protected:Npn \@@_compile_c_lbrack_end: { \if_meaning:w \c_false_bool \l_@@_catcodes_bool \int_set:Nn \l_@@_catcodes_int { \c_@@_all_catcodes_int - \l_@@_catcodes_int } \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}+\@@_compile_c_{:+ % The case of a left brace is easy, based on what we have done so far: % in a group, compile the regular expression, after changing the mode % to forbid nesting |\c|. Additionally, disable submatch tracking % since groups don't escape the scope of |\c{...}|. 
% \begin{macrocode} \cs_new_protected:cpn { @@_compile_c_ \c_left_brace_str :w } { \@@_compile:w \@@_disable_submatches: \l_@@_mode_int = \if_case:w \l_@@_mode_int \c_@@_cs_mode_int \else: \c_@@_cs_in_class_mode_int \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}+\@@_compile_{:+ % We forbid unescaped left braces inside a |\c{...}| escape because % they otherwise lead to the confusing question of whether the first % right brace in |\c{{}x}| should end |\c| or whether one should % match braces. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_ \c_left_brace_str : } { \@@_if_in_cs:TF { \msg_error:nnn { regex } { cu-lbrace } { c } } { \exp_after:wN \@@_compile_raw:N \c_left_brace_str } } % \end{macrocode} % \end{macro} % % \begin{variable}{\l_@@_cs_flag} % \begin{macro}+\@@_compile_}:+ % \begin{macro}{\@@_compile_end_cs:} % \begin{macro}[EXP]{\@@_compile_cs_aux:Nn, \@@_compile_cs_aux:NNnnnN} % Non-escaped right braces are only special if they appear when % compiling the regular expression for a csname, but not within a % class: |\c{[{}]}| matches the control sequences |\{| and |\}|. So, % end compiling the inner regex (this closes any dangling class or % group). Then insert the corresponding test in the outer regex. As % an optimization, if the control sequence test simply consists of % several explicit possibilities (branches) then use % \cs{@@_item_exact_cs:n} with an argument consisting of all % possibilities separated by \cs{scan_stop:}. 
% \begin{macrocode} \flag_new:N \l_@@_cs_flag \cs_new_protected:cpn { @@_compile_ \c_right_brace_str : } { \@@_if_in_cs:TF { \@@_compile_end_cs: } { \exp_after:wN \@@_compile_raw:N \c_right_brace_str } } \cs_new_protected:Npn \@@_compile_end_cs: { \@@_compile_end: \flag_clear:N \l_@@_cs_flag \__kernel_tl_set:Nx \l_@@_internal_a_tl { \exp_after:wN \@@_compile_cs_aux:Nn \l_@@_internal_regex \q_@@_nil \q_@@_nil \q_@@_recursion_stop } \exp_args:Ne \@@_compile_one:n { \flag_if_raised:NTF \l_@@_cs_flag { \@@_item_cs:n { \exp_not:o \l_@@_internal_regex } } { \@@_item_exact_cs:n { \tl_tail:N \l_@@_internal_a_tl } } } } \cs_new:Npn \@@_compile_cs_aux:Nn #1#2 { \cs_if_eq:NNTF #1 \@@_branch:n { \scan_stop: \@@_compile_cs_aux:NNnnnN #2 \q_@@_nil \q_@@_nil \q_@@_nil \q_@@_nil \q_@@_nil \q_@@_nil \q_@@_recursion_stop \@@_compile_cs_aux:Nn } { \@@_quark_if_nil:NF #1 { \flag_ensure_raised:N \l_@@_cs_flag } \@@_use_none_delimit_by_q_recursion_stop:w } } \cs_new:Npn \@@_compile_cs_aux:NNnnnN #1#2#3#4#5#6 { \bool_lazy_all:nTF { { \cs_if_eq_p:NN #1 \@@_class:NnnnN } {#2} { \tl_if_head_eq_meaning_p:nN {#3} \@@_item_caseful_equal:n } { \int_compare_p:nNn { \tl_count:n {#3} } = { 2 } } { \int_compare_p:nNn {#5} = \c_zero_int } } { \prg_replicate:nn {#4} { \char_generate:nn { \use_ii:nn #3 } {12} } \@@_compile_cs_aux:NNnnnN } { \@@_quark_if_nil:NF #1 { \flag_ensure_raised:N \l_@@_cs_flag \@@_use_i_delimit_by_q_recursion_stop:nw } \@@_use_none_delimit_by_q_recursion_stop:w } } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % \end{variable} % % \subsubsection{Raw token lists with \cs[no-index]{u}} % % \begin{macro}{\@@_compile_/u:} % The |\u| escape is invalid in classes and directly following a % catcode test. Otherwise test for a following |r| (for |\ur|), and % call an auxiliary responsible for finding the variable name. 
% \begin{macrocode} \cs_new_protected:cpn { @@_compile_/u: } #1#2 { \@@_if_in_class_or_catcode:TF { \@@_compile_raw_error:N u #1 #2 } { \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_raw:N r { \@@_compile_u_brace:NNN \@@_compile_ur_end: } { \@@_compile_u_brace:NNN \@@_compile_u_end: #1 #2 } } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_u_brace:NNN} % This enforces the presence of a left brace, then starts a loop to % find the variable name. The function~|#1| to run once the name has % been collected is saved in \cs{l_@@_internal_b_tl}. If the brace is % missing, raise an error and compile |u| (or |ur|) as raw characters. % Since it performs assignments and may raise an error, this function % is not expandable and is thus defined as protected. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_u_brace:NNN #1#2#3 { \@@_two_if_eq:NNNNTF #2 #3 \@@_compile_special:N \c_left_brace_str { \tl_set:Nn \l_@@_internal_b_tl {#1} \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi: \@@_compile_u_loop:NN } { \msg_error:nn { regex } { u-missing-lbrace } \token_if_eq_meaning:NNTF #1 \@@_compile_ur_end: { \@@_compile_raw:N u \@@_compile_raw:N r } { \@@_compile_raw:N u } #2 #3 } } % \end{macrocode} % \end{macro} % % \begin{macro}[EXP]{\@@_compile_u_loop:NN} % We collect the characters for the argument of |\u| within an % \texttt{e}-expanding assignment. In principle we could just wait to % encounter a right brace, but this is unsafe: if the right brace was % missing, then we would reach the end-markers of the regex, and % continue, leading to obscure fatal errors. Instead, we only allow % raw and special characters, and stop when encountering a special % right brace, any escaped character, or the end-marker. 
% \begin{macrocode} \cs_new:Npn \@@_compile_u_loop:NN #1#2 { \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N { #2 \@@_compile_u_loop:NN } { \token_if_eq_meaning:NNTF #1 \@@_compile_special:N { \exp_after:wN \token_if_eq_charcode:NNTF \c_right_brace_str #2 { \if_false: { \fi: } \l_@@_internal_b_tl } { \if_charcode:w \c_left_brace_str #2 \msg_expandable_error:nnn { regex } { cu-lbrace } { u } \else: #2 \fi: \@@_compile_u_loop:NN } } { \if_false: { \fi: } \msg_error:nne { regex } { u-missing-rbrace } {#2} \l_@@_internal_b_tl #1 #2 } } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_ur_end:, \@@_compile_ur:n} % \begin{macro}[EXP]{\@@_compile_ur_aux:w} % For the |\ur{...}| construction, once we have extracted the % variable's name, we replace all groups by non-capturing groups in % the compiled regex (passed as the % argument of \cs{@@_compile_ur:n}). If that has a single branch % (namely \cs{tl_if_empty:oTF} is true, because the second argument % grabbed by \cs{@@_compile_ur_aux:w} is the empty brace group rather % than a further \cs{@@_branch:n}) and there is no quantifier, % then simply insert the contents of this branch (obtained by % \cs{use_ii:nn}, which is expanded later). In all other cases, % insert a non-capturing group and look for quantifiers to determine % the number of repetitions, etc. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_ur_end: { \group_begin: \cs_set:Npn \@@_group:nnnN { \@@_group_no_capture:nnnN } \cs_set:Npn \@@_group_resetting:nnnN { \@@_group_no_capture:nnnN } \exp_args:NNe \group_end: \@@_compile_ur:n { \use:c { \l_@@_internal_a_tl } } } \cs_new_protected:Npn \@@_compile_ur:n #1 { \tl_if_empty:oTF { \@@_compile_ur_aux:w #1 {} ? ? 
\q_@@_nil } { \@@_compile_if_quantifier:TFw } { \use_i:nn } { \tl_build_put_right:Nn \l_@@_build_tl { \@@_group_no_capture:nnnN { \if_false: } \fi: #1 } \@@_compile_quantifier:w } { \tl_build_put_right:Nn \l_@@_build_tl { \use_ii:nn #1 } } } \cs_new:Npn \@@_compile_ur_aux:w \@@_branch:n #1#2#3 \q_@@_nil {#2} % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}{\@@_compile_u_end:, \@@_compile_u_payload:} % Once we have extracted the variable's name, we check for % quantifiers, in which case we set up a non-capturing group with a % single branch. Inside this branch (we omit it and the group if % there is no quantifier), \cs{@@_compile_u_payload:} puts % the right tests corresponding to the contents of the variable, which % we store in \cs{l_@@_internal_a_tl}. The behaviour of |\u| % then depends on whether we are within a |\c{...}| escape (in this % case, the variable is turned to a string), or not. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_u_end: { \@@_compile_if_quantifier:TFw { \tl_build_put_right:Nn \l_@@_build_tl { \@@_group_no_capture:nnnN { \if_false: } \fi: \@@_branch:n { \if_false: } \fi: } \@@_compile_u_payload: \tl_build_put_right:Nn \l_@@_build_tl { \if_false: { \fi: } } \@@_compile_quantifier:w } { \@@_compile_u_payload: } } \cs_new_protected:Npn \@@_compile_u_payload: { \tl_set:Nv \l_@@_internal_a_tl { \l_@@_internal_a_tl } \if_int_compare:w \l_@@_mode_int = \c_@@_outer_mode_int \@@_compile_u_not_cs: \else: \@@_compile_u_in_cs: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_u_in_cs:} % When |\u| appears within a control sequence, we convert the variable % to a string with escaped spaces. Then for each character insert a % class matching exactly that character, once. 
% \begin{macrocode} \cs_new_protected:Npn \@@_compile_u_in_cs: { \__kernel_tl_gset:Nx \g_@@_internal_tl { \exp_args:No \__kernel_str_to_other_fast:n { \l_@@_internal_a_tl } } \tl_build_put_right:Ne \l_@@_build_tl { \tl_map_function:NN \g_@@_internal_tl \@@_compile_u_in_cs_aux:n } } \cs_new:Npn \@@_compile_u_in_cs_aux:n #1 { \@@_class:NnnnN \c_true_bool { \@@_item_caseful_equal:n { \int_value:w `#1 } } { 1 } { 0 } \c_false_bool } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_compile_u_not_cs:} % In mode $0$, the |\u| escape adds one state to the NFA for each % token in \cs{l_@@_internal_a_tl}. If a given \meta{token} is a % control sequence, then insert a string comparison test, otherwise, % \cs{@@_item_exact:nn} which compares catcode and character code. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_u_not_cs: { \tl_analysis_map_inline:Nn \l_@@_internal_a_tl { \tl_build_put_right:Ne \l_@@_build_tl { \@@_class:NnnnN \c_true_bool { \if_int_compare:w "##3 = \c_zero_int \@@_item_exact_cs:n { \exp_after:wN \cs_to_str:N ##1 } \else: \@@_item_exact:nn { \int_value:w "##3 } { ##2 } \fi: } { 1 } { 0 } \c_false_bool } } } % \end{macrocode} % \end{macro} % % \subsubsection{Other} % % \begin{macro}{\@@_compile_/K:} % The |\K| control sequence is currently the only \enquote{command}, % which performs some action, rather than matching something. It is % allowed in the same contexts as |\b|. At the compilation stage, we % leave it as a single control sequence, defined later. 
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_/K: }
  {
    \int_compare:nNnTF \l_@@_mode_int = \c_@@_outer_mode_int
      { \tl_build_put_right:Nn \l_@@_build_tl { \@@_command_K: } }
      { \@@_compile_raw_error:N K }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Showing regexes}
%
% \begin{macro}[rEXP]
%   {
%     \@@_clean_bool:n, \@@_clean_int:n, \@@_clean_int_aux:N,
%     \@@_clean_regex:n, \@@_clean_regex_loop:w, \@@_clean_branch:n,
%     \@@_clean_branch_loop:n, \@@_clean_assertion:Nn,
%     \@@_clean_class:NnnnN, \@@_clean_group:nnnN, \@@_clean_class:n,
%     \@@_clean_class_loop:nnn, \@@_clean_exact_cs:n,
%     \@@_clean_exact_cs:w
%   }
%   Before showing a regex we check that it is \enquote{clean} in the
%   sense that it has the correct internal structure.  We do this (in
%   the implementation of \cs{regex_show:N} and \cs{regex_log:N}) by
%   comparing it with a cleaned-up version of the same regex.  Along the
%   way we also need similar functions for other types: all
%   \cs[no-index]{@@_clean_\meta{type}:n} functions produce valid
%   \meta{type} tokens (bool, explicit integer, etc.\@) from arbitrary
%   input, and the output coincides with the input if that was valid.
%
%   A boolean which is not a single token is replaced by
%   \cs{c_true_bool}.  For an integer, leading minus signs are kept,
%   then digits are collected up to the first character which is not a
%   digit; the explicit~|0| ensures that \cs{int_eval:n} receives a
%   non-empty expression.
%    \begin{macrocode}
\cs_new:Npn \@@_clean_bool:n #1
  {
    \tl_if_single:nTF {#1}
      { \bool_if:NTF #1 \c_true_bool \c_false_bool }
      { \c_true_bool }
  }
\cs_new:Npn \@@_clean_int:n #1
  {
    \tl_if_head_eq_meaning:nNTF {#1} -
      { - \exp_args:No \@@_clean_int:n { \use_none:n #1 } }
      { \int_eval:n { 0 \str_map_function:nN {#1} \@@_clean_int_aux:N } }
  }
\cs_new:Npn \@@_clean_int_aux:N #1
  {
    \if_int_compare:w \c_one_int < 1 #1 ~
      #1
    \else:
      \str_map_break:n
    \fi:
  }
%    \end{macrocode}
%   A regex is cleaned by looping over its \cs{@@_branch:n}
%   \Arg{branch} items; the loop stops at the branch consisting of
%   \cs{q_recursion_tail}, and anything which precedes a
%   \cs{@@_branch:n} is dropped.  Within a branch, each item is
%   dispatched according to the meaning of its first token to an
%   auxiliary which cleans the arguments of that item and resumes the
%   loop.  Anything unexpected ends the cleaning of the branch through
%   \cs{prg_break:}.  The six trailing question marks provide dummy
%   arguments: the auxiliary with the most arguments,
%   \cs{@@_clean_class:NnnnN}, grabs five of them and the loop grabs
%   one more before breaking.
%    \begin{macrocode}
\cs_new:Npn \@@_clean_regex:n #1
  {
    \@@_clean_regex_loop:w #1
    \@@_branch:n { \q_recursion_tail } \q_recursion_stop
  }
\cs_new:Npn \@@_clean_regex_loop:w #1 \@@_branch:n #2
  {
    \quark_if_recursion_tail_stop:n {#2}
    \@@_branch:n { \@@_clean_branch:n {#2} }
    \@@_clean_regex_loop:w
  }
\cs_new:Npn \@@_clean_branch:n #1
  {
    \@@_clean_branch_loop:n #1
    ? ? ? ? ? ? \prg_break_point:
  }
\cs_new:Npn \@@_clean_branch_loop:n #1
  {
    \tl_if_single:nF {#1} \prg_break:
    \token_case_meaning:NnF #1
      {
        \@@_command_K: { #1 \@@_clean_branch_loop:n }
        \@@_assertion:Nn { #1 \@@_clean_assertion:Nn }
        \@@_class:NnnnN { #1 \@@_clean_class:NnnnN }
        \@@_group:nnnN { #1 \@@_clean_group:nnnN }
        \@@_group_no_capture:nnnN { #1 \@@_clean_group:nnnN }
        \@@_group_resetting:nnnN { #1 \@@_clean_group:nnnN }
      }
      \prg_break:
  }
%    \end{macrocode}
%   An assertion whose test is not one of the four known tests is
%   replaced by one using \cs{@@_A_test:}, and the cleaning of the
%   branch stops there.  For classes and groups, the \meta{min}
%   argument is forced to be at least~$0$ and the \meta{more} argument
%   at least~$-1$.
%    \begin{macrocode}
\cs_new:Npn \@@_clean_assertion:Nn #1#2
  {
    \@@_clean_bool:n {#1}
    \tl_if_single:nF {#2} { { \@@_A_test: } \prg_break: }
    \token_case_meaning:NnTF #2
      {
        \@@_A_test: { }
        \@@_G_test: { }
        \@@_Z_test: { }
        \@@_b_test: { }
      }
      { {#2} }
      { { \@@_A_test: } \prg_break: }
    \@@_clean_branch_loop:n
  }
\cs_new:Npn \@@_clean_class:NnnnN #1#2#3#4#5
  {
    \@@_clean_bool:n {#1}
    { \@@_clean_class:n {#2} }
    { \int_max:nn \c_zero_int { \@@_clean_int:n {#3} } }
    { \int_max:nn { -\c_one_int } { \@@_clean_int:n {#4} } }
    \@@_clean_bool:n {#5}
    \@@_clean_branch_loop:n
  }
\cs_new:Npn \@@_clean_group:nnnN #1#2#3#4
  {
    { \@@_clean_regex:n {#1} }
    { \int_max:nn \c_zero_int { \@@_clean_int:n {#2} } }
    { \int_max:nn { -\c_one_int } { \@@_clean_int:n {#3} } }
    \@@_clean_bool:n {#4}
    \@@_clean_branch_loop:n
  }
\cs_new:Npn \@@_clean_class:n #1
  { \@@_clean_class_loop:nnn #1 ????? \prg_break_point: }
%    \end{macrocode}
%   When cleaning a class there are many cases, including a dozen or so
%   like \cs{@@_prop_d:} or \cs{@@_posix_alpha:}.  To avoid listing all of
%   them we allow any command that starts with the
%   $13$ characters |__regex_prop_| or |__regex_posix| (handily these have
%   the same length, except for the trailing underscore).
%   The loop always grabs three arguments: the item |#1| and two
%   further arguments.  Items which take a single argument are tested
%   first, and for these |#3| is given back to the loop.  Then come
%   items which take two integer arguments, then the two catcode tests
%   (an integer and a class), and finally the commands without
%   argument mentioned above, for which both |#2| and |#3| are given
%   back to the loop.
%    \begin{macrocode}
\cs_new:Npn \@@_clean_class_loop:nnn #1#2#3
  {
    \tl_if_single:nF {#1} \prg_break:
    \token_case_meaning:NnTF #1
      {
        \@@_item_cs:n { #1 { \@@_clean_regex:n {#2} } }
        \@@_item_exact_cs:n { #1 { \@@_clean_exact_cs:n {#2} } }
        \@@_item_caseful_equal:n { #1 { \@@_clean_int:n {#2} } }
        \@@_item_caseless_equal:n { #1 { \@@_clean_int:n {#2} } }
        \@@_item_reverse:n { #1 { \@@_clean_class:n {#2} } }
      }
      { \@@_clean_class_loop:nnn {#3} }
      {
        \token_case_meaning:NnTF #1
          {
            \@@_item_caseful_range:nn { }
            \@@_item_caseless_range:nn { }
            \@@_item_exact:nn { }
          }
          {
            #1 { \@@_clean_int:n {#2} } { \@@_clean_int:n {#3} }
            \@@_clean_class_loop:nnn
          }
          {
            \token_case_meaning:NnTF #1
              {
                \@@_item_catcode:nT { }
                \@@_item_catcode_reverse:nT { }
              }
              {
                #1 { \@@_clean_int:n {#2} } { \@@_clean_class:n {#3} }
                \@@_clean_class_loop:nnn
              }
              {
                \exp_args:Ne \str_case:nnTF
                  {
                    \exp_args:Ne \str_range:nnn
                      { \cs_to_str:N #1 } \c_one_int { 13 }
                  }
                  {
                    { @@_prop_ } { }
                    { @@_posix } { }
                  }
                  {
                    #1
                    \@@_clean_class_loop:nnn {#2} {#3}
                  }
                  \prg_break:
              }
          }
      }
  }
%    \end{macrocode}
%   The argument of \cs{@@_item_exact_cs:n} is a list of items
%   separated by \cs{scan_stop:}.  Each item is converted to a string
%   and preceded by \cs{scan_stop:}; the very first \cs{scan_stop:},
%   at which the \texttt{f}-expansion stops, is then removed by
%   \cs{use_none:n}.
%    \begin{macrocode}
\cs_new:Npn \@@_clean_exact_cs:n #1
  {
    \exp_last_unbraced:Nf \use_none:n
      {
        \@@_clean_exact_cs:w #1
        \scan_stop: \q_recursion_tail \scan_stop:
        \q_recursion_stop
      }
  }
\cs_new:Npn \@@_clean_exact_cs:w #1 \scan_stop:
  {
    \quark_if_recursion_tail_stop:n {#1}
    \scan_stop: \tl_to_str:n {#1}
    \@@_clean_exact_cs:w
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show:N}
%   Within a group and within \cs{tl_build_begin:N} \ldots{}
%   \cs{tl_build_end:N} we redefine all the functions that can appear
%   in a compiled regex, then run the regex.  The result stored in
%   \cs{l_@@_internal_a_tl} is then meant to be shown.
%   Every redefinition which calls \cs{@@_show_one:n} or
%   \cs{@@_show_scope:nn} performs assignments, hence is protected;
%   this includes the one for \cs[no-index]{@@_prop_.:}.  Only the four
%   assertion tests are left expandable, because they are expanded to
%   their textual description inside the argument of
%   \cs{@@_show_one:n}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show:N #1
  {
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \cs_set_protected:Npn \@@_branch:n
        {
          \seq_pop_right:NN \l_@@_show_prefix_seq
            \l_@@_internal_a_tl
          \@@_show_one:n { +-branch }
          \seq_put_right:No \l_@@_show_prefix_seq
            \l_@@_internal_a_tl
          \use:n
        }
      \cs_set_protected:Npn \@@_group:nnnN
        { \@@_show_group_aux:nnnnN { } }
      \cs_set_protected:Npn \@@_group_no_capture:nnnN
        { \@@_show_group_aux:nnnnN { ~(no~capture) } }
      \cs_set_protected:Npn \@@_group_resetting:nnnN
        { \@@_show_group_aux:nnnnN { ~(resetting) } }
      \cs_set_eq:NN \@@_class:NnnnN \@@_show_class:NnnnN
      \cs_set_protected:Npn \@@_command_K:
        { \@@_show_one:n { reset~match~start~(\iow_char:N\\K) } }
      \cs_set_protected:Npn \@@_assertion:Nn ##1##2
        {
          \@@_show_one:n
            { \bool_if:NF ##1 { negative~ } assertion:~##2 }
        }
      \cs_set:Npn \@@_b_test: { word~boundary }
      \cs_set:Npn \@@_Z_test: { anchor~at~end~(\iow_char:N\\Z) }
      \cs_set:Npn \@@_A_test: { anchor~at~start~(\iow_char:N\\A) }
      \cs_set:Npn \@@_G_test:
        { anchor~at~start~of~match~(\iow_char:N\\G) }
      \cs_set_protected:Npn \@@_item_caseful_equal:n ##1
        { \@@_show_one:n { char~code~\@@_show_char:n{##1} } }
      \cs_set_protected:Npn \@@_item_caseful_range:nn ##1##2
        {
          \@@_show_one:n
            { range~[\@@_show_char:n{##1}, \@@_show_char:n{##2}] }
        }
      \cs_set_protected:Npn \@@_item_caseless_equal:n ##1
        { \@@_show_one:n { char~code~\@@_show_char:n{##1}~(caseless) } }
      \cs_set_protected:Npn \@@_item_caseless_range:nn ##1##2
        {
          \@@_show_one:n
            { Range~[\@@_show_char:n{##1}, \@@_show_char:n{##2}]~(caseless) }
        }
      \cs_set_protected:Npn \@@_item_catcode:nT
        { \@@_show_item_catcode:NnT \c_true_bool }
      \cs_set_protected:Npn \@@_item_catcode_reverse:nT
        { \@@_show_item_catcode:NnT \c_false_bool }
      \cs_set_protected:Npn \@@_item_reverse:n
        { \@@_show_scope:nn { Reversed~match } }
      \cs_set_protected:Npn \@@_item_exact:nn ##1##2
        { \@@_show_one:n { char~\@@_show_char:n{##2},~catcode~##1 } }
      \cs_set_eq:NN \@@_item_exact_cs:n \@@_show_item_exact_cs:n
      \cs_set_protected:Npn \@@_item_cs:n
        { \@@_show_scope:nn { control~sequence } }
      \cs_set_protected:cpn { @@_prop_.: }
        { \@@_show_one:n { any~token } }
      \seq_clear:N \l_@@_show_prefix_seq
      \@@_show_push:n { ~ }
      \cs_if_exist_use:N #1
      \tl_build_end:N \l_@@_build_tl
      \exp_args:NNNo
    \group_end:
    \tl_set:Nn \l_@@_internal_a_tl { \l_@@_build_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_show_char:n}
%   Show a single character, together with its ascii representation if
%   available (character codes from $32$ to $126$).  This could be
%   extended to beyond ascii.  It is not ideal for parentheses
%   themselves.
%    \begin{macrocode}
\cs_new:Npn \@@_show_char:n #1
  {
    \int_eval:n {#1}
    \int_compare:nT { 32 <= #1 <= 126 }
      { ~ ( \char_generate:nn {#1} {12} ) }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_one:n}
%   Every part of the final message goes through this function, which
%   adds one line to the output, with the appropriate prefix.  It also
%   counts the lines in \cs{l_@@_show_lines_int}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_one:n #1
  {
    \int_incr:N \l_@@_show_lines_int
    \tl_build_put_right:Ne \l_@@_build_tl
      {
        \exp_not:N \iow_newline:
        \seq_map_function:NN \l_@@_show_prefix_seq \use:n
        #1
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\@@_show_push:n, \@@_show_pop:, \@@_show_scope:nn}
%   Enter and exit levels of nesting.  The \texttt{scope} function prints
%   its first argument as an \enquote{introduction}, then performs its
%   second argument in a deeper level of nesting.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_push:n #1
  { \seq_put_right:Ne \l_@@_show_prefix_seq { #1 ~ } }
\cs_new_protected:Npn \@@_show_pop:
  { \seq_pop_right:NN \l_@@_show_prefix_seq \l_@@_internal_a_tl }
\cs_new_protected:Npn \@@_show_scope:nn #1#2
  {
    \@@_show_one:n {#1}
    \@@_show_push:n { ~ }
    #2
    \@@_show_pop:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_group_aux:nnnnN}
%   We display all groups in the same way, simply adding a message,
%   \texttt{(no capture)} or \texttt{(resetting)}, to special groups.
%   The odd \cs{use_ii:nn} avoids printing a spurious \texttt{+-branch}
%   for the first branch.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_group_aux:nnnnN #1#2#3#4#5
  {
    \@@_show_one:n { ,-group~begin #1 }
    \@@_show_push:n { | }
    \use_ii:nn #2
    \@@_show_pop:
    \@@_show_one:n
      { `-group~end \@@_msg_repeated:nnN {#3} {#4} #5 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_class:NnnnN}
%   I'm entirely unhappy about this function: I couldn't find a way to
%   test if a class is a single test.  Instead, collect the
%   representation of the tests in the class.  If that had more than one
%   line, write \texttt{Match} or \texttt{Don't match} on its own line,
%   with the repeating information if any.  Then the various tests on
%   lines of their own, and finally a line.  Otherwise, we need to
%   evaluate the representation of the tests again (since the prefix is
%   incorrect).  That's clunky, but not too expensive, since it's only
%   one test.  A class with no test at all is shown as \texttt{Fail}
%   (positive class) or \texttt{Pass} (negative class).
%
%   The function opens a group and performs assignments, so it is
%   protected, like all other \texttt{show} functions.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_class:NnnnN #1#2#3#4#5
  {
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \int_zero:N \l_@@_show_lines_int
      \@@_show_push:n {~}
      #2
    \int_compare:nTF { \l_@@_show_lines_int = \c_zero_int }
      {
        \group_end:
        \@@_show_one:n { \bool_if:NTF #1 { Fail } { Pass } }
      }
      {
        \bool_if:nTF
          { #1 && \int_compare_p:n { \l_@@_show_lines_int = \c_one_int } }
          {
            \group_end:
            #2
            \tl_build_put_right:Nn \l_@@_build_tl
              { \@@_msg_repeated:nnN {#3} {#4} #5 }
          }
          {
              \tl_build_end:N \l_@@_build_tl
              \exp_args:NNNo
            \group_end:
            \tl_set:Nn \l_@@_internal_a_tl \l_@@_build_tl
            \@@_show_one:n
              {
                \bool_if:NTF #1 { Match } { Don't~match }
                \@@_msg_repeated:nnN {#3} {#4} #5
              }
            \tl_build_put_right:Ne \l_@@_build_tl
              { \exp_not:o \l_@@_internal_a_tl }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_item_catcode:NnT}
%   Produce a sequence of categories which the catcode bitmap |#2|
%   contains, and show it, indenting the tests on which this catcode
%   constraint applies.
%   The definition only grabs two arguments: the tests to which the
%   constraint applies are left in the input, where they become the
%   second argument of \cs{@@_show_scope:nn}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_item_catcode:NnT #1#2
  {
    \seq_set_split:Nnn \l_@@_internal_seq { } { CBEMTPUDSLOA }
    \seq_set_filter:NNn \l_@@_internal_seq \l_@@_internal_seq
      { \int_if_odd_p:n { #2 / \int_use:c { c_@@_catcode_##1_int } } }
    \@@_show_scope:nn
      {
        categories~
        \seq_map_function:NN \l_@@_internal_seq \use:n
        , ~
        \bool_if:NF #1 { negative~ } class
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_item_exact_cs:n}
%   The argument is split at each \cs{scan_stop:}; every resulting
%   item is shown with a leading backslash, and the items are joined
%   by \enquote{or} on a single line.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_item_exact_cs:n #1
  {
    \seq_set_split:Nnn \l_@@_internal_seq { \scan_stop: } {#1}
    \seq_set_map_e:NNn \l_@@_internal_seq
      \l_@@_internal_seq { \iow_char:N\\##1 }
    \@@_show_one:n
      { control~sequence~ \seq_use:Nn \l_@@_internal_seq { ~or~ } }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Building}
%
% \subsubsection{Variables used while building}
%
% \begin{variable}{\l_@@_min_state_int, \l_@@_max_state_int}
%   The last state that was allocated is
%   $\cs{l_@@_max_state_int}-1$, so that \cs{l_@@_max_state_int} always
%   points to a free state.  The \texttt{min_state} variable is
%   $1$ to begin with, but gets shifted in nested calls to the matching
%   code, namely in |\c{...}| constructions.
%    \begin{macrocode}
\int_new:N \l_@@_min_state_int
\int_set:Nn \l_@@_min_state_int { 1 }
\int_new:N \l_@@_max_state_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_left_state_int, \l_@@_right_state_int}
% \begin{variable}{\l_@@_left_state_seq, \l_@@_right_state_seq}
%   Alternatives are implemented by branching from a \texttt{left} state
%   into the various choices, then merging those into a \texttt{right}
%   state.  We store information about those states in two sequences.
%   Those states are also used to implement group quantifiers.  Most
%   often, the left and right pointers only differ by~$1$.
%    \begin{macrocode}
\int_new:N \l_@@_left_state_int
\int_new:N \l_@@_right_state_int
\seq_new:N \l_@@_left_state_seq
\seq_new:N \l_@@_right_state_seq
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{variable}{\l_@@_capturing_group_int}
%   \cs{l_@@_capturing_group_int} is the next \textsc{id} number to
%   be assigned to a capturing group.  This starts
%   at $0$ for the group enclosing the full regular expression, and
%   groups are counted in the order of their left parenthesis, except
%   when encountering \texttt{resetting} groups.
%    \begin{macrocode}
\int_new:N \l_@@_capturing_group_int
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Framework}
%
% This phase is about going from a compiled regex to an \textsc{nfa}.
% Each state of the \textsc{nfa} is stored in a \tn{toks}.  The
% operations which can appear in the \tn{toks} are
% \begin{itemize}
%   \item \cs{@@_action_start_wildcard:N} \meta{boolean} inserted at the
%     start of the regular expression, where a \texttt{true}
%     \meta{boolean} makes it unanchored.
%   \item \cs{@@_action_success:} marks the exit state of the
%     \textsc{nfa}.
%   \item \cs{@@_action_cost:n} \Arg{shift} is a transition from the
%     current \meta{state} to $\meta{state}+\meta{shift}$, which
%     consumes the current character: the target state is saved and will
%     be considered again when matching at the next position.
%   \item \cs{@@_action_free:n} \Arg{shift}, and
%     \cs{@@_action_free_group:n} \Arg{shift} are free transitions,
%     which immediately perform the actions for the state
%     $\meta{state}+\meta{shift}$ of the \textsc{nfa}.  They differ in
%     how they detect and avoid infinite loops.  For now, we just need to
%     know that the \texttt{group} variant must be used for transitions
%     back to the start of a group.
%   \item \cs{@@_action_submatch:nN} \Arg{group} \meta{key} where the
%     \meta{key} is |<| or |>| for the beginning or end of group
%     numbered \meta{group}.  This causes the current position in the
%     query to be stored as the \meta{key} submatch boundary.
%   \item One of these actions, within a conditional.
% \end{itemize}
%
% We strive to preserve the following properties while building.
% \begin{itemize}
%   \item The current capturing group is
%     $\text{\texttt{capturing_group}}-1$, and if a group opened now
%     it would be labelled \texttt{capturing_group}.
%   \item The last allocated state is $\text{\texttt{max_state}}-1$, so
%     \texttt{max_state} is a free state.
%   \item The \texttt{left_state} points to a state to the left of the
%     current group or of the last class.
%   \item The \texttt{right_state} points to a newly created,
%     empty state, with some transitions leading to it.
%   \item The \texttt{left/right} sequences hold a list of the
%     corresponding end-points of nested groups (they are used as
%     stacks by \cs{@@_push_lr_states:} and \cs{@@_pop_lr_states:}).
% \end{itemize}
%
% \begin{macro}{\@@_build:n, \@@_build_aux:Nn, \@@_build:N, \@@_build_aux:NN}
%   The \texttt{n}-type function first compiles its argument.  Reset some
%   variables.  Allocate two states, and put a wildcard in the first of
%   them, which we call state~$0$ here (it has transitions to
%   state~$1$ and to state~$0$ itself).  Then build the regex
%   within a (capturing) group numbered $0$ (current
%   value of \texttt{capturing_group}).  Finally, if the match reaches the
%   last state, it is successful.  A \texttt{false} boolean for argument
%   |#1| for the auxiliaries will suppress the wildcard and make the
%   match anchored: used for \cs{peek_regex:nTF} and similar.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build:n
  { \@@_build_aux:Nn \c_true_bool }
\cs_new_protected:Npn \@@_build:N
  { \@@_build_aux:NN \c_true_bool }
\cs_new_protected:Npn \@@_build_aux:Nn #1#2
  {
    \@@_compile:n {#2}
    \@@_build_aux:NN #1 \l_@@_internal_regex
  }
\cs_new_protected:Npn \@@_build_aux:NN #1#2
  {
    \@@_standard_escapechar:
    \int_zero:N \l_@@_capturing_group_int
    \int_set_eq:NN \l_@@_max_state_int \l_@@_min_state_int
    \@@_build_new_state:
    \@@_build_new_state:
    \@@_toks_put_right:Nn \l_@@_left_state_int
      { \@@_action_start_wildcard:N #1 }
    \@@_group:nnnN {#2} { 1 } { 0 } \c_false_bool
    \@@_toks_put_right:Nn \l_@@_right_state_int
      { \@@_action_success: }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\g_@@_case_int}
%   Case number that was successfully matched in
%   \cs{regex_match_case:nn} and related functions.
%    \begin{macrocode}
\int_new:N \g_@@_case_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_case_max_group_int}
%   The largest group number appearing in any of the \meta{regex} in the
%   argument of \cs{regex_match_case:nn} and related functions.
%    \begin{macrocode}
\int_new:N \l_@@_case_max_group_int
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_case_build:n, \@@_case_build:e, \@@_case_build_aux:Nn, \@@_case_build_loop:n}
%   See \cs{@@_build:n}, but with a loop.  The argument of
%   \cs{@@_case_build:n} is a list of regexes, each of which is built
%   by \cs{@@_case_build_loop:n}.  While building,
%   \cs{g_@@_case_int} holds the number of the case being built; it is
%   reset to zero once all cases are built.  Since there is no
%   enclosing group numbered~$0$ here, the start of the overall match
%   is recorded by hand in a state placed before all cases, and the
%   end of the match is recorded in the final state of each case.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_case_build:n #1
  {
    \@@_case_build_aux:Nn \c_true_bool {#1}
    \int_gzero:N \g_@@_case_int
  }
\cs_generate_variant:Nn \@@_case_build:n { e }
\cs_new_protected:Npn \@@_case_build_aux:Nn #1#2
  {
    \@@_standard_escapechar:
    \int_set_eq:NN \l_@@_max_state_int \l_@@_min_state_int
    \@@_build_new_state:
    \@@_build_new_state:
    \@@_toks_put_right:Nn \l_@@_left_state_int
      { \@@_action_start_wildcard:N #1 }
    %
    \@@_build_new_state:
    \@@_toks_put_left:Ne \l_@@_left_state_int
      { \@@_action_submatch:nN \c_zero_int < }
    \@@_push_lr_states:
    \int_zero:N \l_@@_case_max_group_int
    \int_gzero:N \g_@@_case_int
    \tl_map_inline:nn {#2}
      {
        \int_gincr:N \g_@@_case_int
        \@@_case_build_loop:n {##1}
      }
    \int_set_eq:NN \l_@@_capturing_group_int \l_@@_case_max_group_int
    \@@_pop_lr_states:
  }
%    \end{macrocode}
%   For each case, group numbering restarts at~$1$, and the largest
%   group number reached by any case is kept in
%   \cs{l_@@_case_max_group_int}.  The right state of the case receives
%   the end of the overall match, an assignment of the case number to
%   \cs{g_@@_case_int} (the number is expanded now, while building),
%   and the success action.  A new empty state then replaces it on the
%   stack of right states, for use by the next case.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_case_build_loop:n #1
  {
    \int_set_eq:NN \l_@@_capturing_group_int \c_one_int
    \@@_compile_use:n {#1}
    \int_set:Nn \l_@@_case_max_group_int
      {
        \int_max:nn \l_@@_case_max_group_int
          \l_@@_capturing_group_int
      }
    \seq_pop:NN \l_@@_right_state_seq \l_@@_internal_a_tl
    \int_set:Nn \l_@@_right_state_int \l_@@_internal_a_tl
    \@@_toks_put_left:Ne \l_@@_right_state_int
      {
        \@@_action_submatch:nN \c_zero_int >
        \int_gset:Nn \g_@@_case_int
          { \int_use:N \g_@@_case_int }
        \@@_action_success:
      }
    \@@_toks_clear:N \l_@@_max_state_int
    \seq_push:No \l_@@_right_state_seq
      { \int_use:N \l_@@_max_state_int }
    \int_incr:N \l_@@_max_state_int
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_build_for_cs:n}
%   The matching code relies on some global intarray variables, but only
%   uses a range of their entries.  Specifically,
%   \begin{itemize}
%     \item \cs{g_@@_state_active_intarray} from \cs{l_@@_min_state_int}
%       to $\cs{l_@@_max_state_int}-1$;
%   \end{itemize}
%   Here, in this nested call to the
%   matching code, we need the new versions of this range to involve
%   completely new entries of the intarray variables, so we begin by
%   setting (the new) \cs{l_@@_min_state_int} to (the old)
%   \cs{l_@@_max_state_int} to use higher entries.
%
%   When using a regex to match a cs, we don't insert a wildcard, we
%   anchor at the end, and since we ignore submatches, there is no need
%   to surround the expression with a group.  However, for branches to
%   work properly at the outer level, we need to put the appropriate
%   \texttt{left} and \texttt{right} states in their sequence.  The
%   anchoring at the end is done by only succeeding when
%   \cs{l_@@_curr_char_int} is $-2$.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build_for_cs:n #1
  {
    \int_set_eq:NN \l_@@_min_state_int \l_@@_max_state_int
    \@@_build_new_state:
    \@@_build_new_state:
    \@@_push_lr_states:
    #1
    \@@_pop_lr_states:
    \@@_toks_put_right:Nn \l_@@_right_state_int
      {
        \if_int_compare:w -2 = \l_@@_curr_char_int
          \exp_after:wN \@@_action_success:
        \fi:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Helpers for building an \textsc{nfa}}
%
% \begin{macro}{\@@_push_lr_states:, \@@_pop_lr_states:}
%   When building the regular expression, we keep track of pointers to
%   the left-end and right-end of each group without help from \TeX{}'s
%   grouping.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_push_lr_states:
  {
    \seq_push:No \l_@@_left_state_seq
      { \int_use:N \l_@@_left_state_int }
    \seq_push:No \l_@@_right_state_seq
      { \int_use:N \l_@@_right_state_int }
  }
\cs_new_protected:Npn \@@_pop_lr_states:
  {
    \seq_pop:NN \l_@@_left_state_seq \l_@@_internal_a_tl
    \int_set:Nn \l_@@_left_state_int \l_@@_internal_a_tl
    \seq_pop:NN \l_@@_right_state_seq \l_@@_internal_a_tl
    \int_set:Nn \l_@@_right_state_int \l_@@_internal_a_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_build_transition_left:NNN,
%     \@@_build_transition_right:nNn
%   }
%   Add a transition from |#2| to |#3| using the function |#1|.  The
%   \texttt{left} function is used for higher priority transitions, and
%   the \texttt{right} function for lower priority transitions (which
%   should be performed later).  The signatures differ to reflect the
%   differing usage later on.  Both functions could be optimized.
%   What is stored in the \tn{toks} of state |#2| is the function |#1|
%   followed by the shift $|#3|-|#2|$, evaluated at the time of
%   building.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build_transition_left:NNN #1#2#3
  { \@@_toks_put_left:Ne #2 { #1 { \tex_the:D \@@_int_eval:w #3 - #2 } } }
\cs_new_protected:Npn \@@_build_transition_right:nNn #1#2#3
  { \@@_toks_put_right:Ne #2 { #1 { \tex_the:D \@@_int_eval:w #3 - #2 } } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_build_new_state:}
%   Add a new empty state to the \textsc{nfa}.  Then update the
%   \texttt{left}, \texttt{right}, and \texttt{max} states, so that the
%   \texttt{right} state is the new empty state, and the \texttt{left}
%   state points to the previously \enquote{current} state.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build_new_state:
  {
    \@@_toks_clear:N \l_@@_max_state_int
    \int_set_eq:NN \l_@@_left_state_int \l_@@_right_state_int
    \int_set_eq:NN \l_@@_right_state_int \l_@@_max_state_int
    \int_incr:N \l_@@_max_state_int
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_build_transitions_laziness:NNNNN}
%   This function creates a new state, and puts two transitions starting
%   from the old current state.  The order of the transitions is
%   controlled by |#1|, true for lazy quantifiers, and false for greedy
%   quantifiers.  The two transitions are given as a function |#2| with
%   target state |#3|, and a function |#4| with target state |#5|:
%   the first pair is put first when |#1| is true, and last otherwise.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build_transitions_laziness:NNNNN #1#2#3#4#5
  {
    \@@_build_new_state:
    \@@_toks_put_right:Ne \l_@@_left_state_int
      {
        \if_meaning:w \c_true_bool #1
          #2 { \tex_the:D \@@_int_eval:w #3 - \l_@@_left_state_int }
          #4 { \tex_the:D \@@_int_eval:w #5 - \l_@@_left_state_int }
        \else:
          #4 { \tex_the:D \@@_int_eval:w #5 - \l_@@_left_state_int }
          #2 { \tex_the:D \@@_int_eval:w #3 - \l_@@_left_state_int }
        \fi:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Building classes}
%
% \begin{macro}{\@@_class:NnnnN}
% \begin{macro}[rEXP]{\@@_tests_action_cost:n}
%   The arguments are: \meta{boolean} \Arg{tests} \Arg{min} \Arg{more}
%   \meta{laziness}.  First store the tests with a trailing
%   \cs{@@_action_cost:n}, in the true branch of
%   \cs{@@_break_point:TF} for positive classes, or the false branch
%   for negative classes.  The integer \meta{more} is $0$ for fixed
%   repetitions, $-1$ for unbounded repetitions, and
%   $\meta{max}-\meta{min}$ for a range of repetitions.  The tests are
%   protected by two \cs{exp_not:n}: one for the \texttt{e}-expanding
%   definition here, and one for the \texttt{e}-expansion which takes
%   place when the transition is stored in a \tn{toks}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_class:NnnnN #1#2#3#4#5
  {
    \cs_set:Npe \@@_tests_action_cost:n ##1
      {
        \exp_not:n { \exp_not:n {#2} }
        \bool_if:NTF #1
          { \@@_break_point:TF { \@@_action_cost:n {##1} } { } }
          { \@@_break_point:TF { } { \@@_action_cost:n {##1} } }
      }
    \if_case:w - #4 \exp_stop_f:
      \@@_class_repeat:n {#3}
    \or:
      \@@_class_repeat:nN {#3} #5
    \else:
      \@@_class_repeat:nnN {#3} {#4} #5
    \fi:
  }
\cs_new:Npn \@@_tests_action_cost:n { \@@_action_cost:n }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_class_repeat:n}
%   This is used for a fixed number of repetitions.  Build one state for
%   each repetition, with a transition controlled by the tests that we
%   have collected.  That works just fine for |#1|${}=0$ repetitions:
%   nothing is built.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_class_repeat:n #1
  {
    \prg_replicate:nn {#1}
      {
        \@@_build_new_state:
        \@@_build_transition_right:nNn \@@_tests_action_cost:n
          \l_@@_left_state_int \l_@@_right_state_int
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_class_repeat:nN}
%   This implements unbounded repetitions of a single class (\emph{e.g.}
%   the |*| and |+| quantifiers).  If the minimum number |#1| of
%   repetitions is $0$, then build a transition from the current state
%   to itself governed by the tests, and a free transition to a new
%   state (hence skipping the tests).  Otherwise, call
%   \cs{@@_class_repeat:n} for the code to match |#1| repetitions,
%   and add free transitions from the last state to the previous one,
%   and to a new one.  In both cases, the order of transitions is
%   controlled by the laziness boolean |#2|.  The previous state is
%   saved in \cs{l_@@_internal_a_int} because building the new state
%   changes \cs{l_@@_left_state_int}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_class_repeat:nN #1#2
  {
    \if_int_compare:w #1 = \c_zero_int
      \@@_build_transitions_laziness:NNNNN #2
        \@@_action_free:n \l_@@_right_state_int
        \@@_tests_action_cost:n \l_@@_left_state_int
    \else:
      \@@_class_repeat:n {#1}
      \int_set_eq:NN \l_@@_internal_a_int \l_@@_left_state_int
      \@@_build_transitions_laziness:NNNNN #2
        \@@_action_free:n \l_@@_right_state_int
        \@@_action_free:n \l_@@_internal_a_int
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_class_repeat:nnN}
%   We want to build the code to match from |#1| to $|#1|+|#2|$
%   repetitions.  Match |#1| repetitions (can be $0$).  Compute the final
%   state of the next construction as \texttt{a}.  Build $|#2|>0$ states,
%   each with a transition to the next state governed by the tests, and
%   a transition to the final state \texttt{a}.  The computation of
%   \texttt{a} is safe because states are allocated in order, starting
%   from \texttt{max_state}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_class_repeat:nnN #1#2#3
  {
    \@@_class_repeat:n {#1}
    \int_set:Nn \l_@@_internal_a_int
      { \l_@@_max_state_int + #2 - \c_one_int }
    \prg_replicate:nn { #2 }
      {
        \@@_build_transitions_laziness:NNNNN #3
          \@@_action_free:n \l_@@_internal_a_int
          \@@_tests_action_cost:n \l_@@_right_state_int
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Building groups}
%
% \begin{macro}{\@@_group_aux:nnnnN}
%   Arguments: \Arg{label} \Arg{contents} \Arg{min} \Arg{more}
%   \meta{laziness}.  If \meta{min} is $0$, we need to add a state before
%   building the group, so that the thread which skips the group does
%   not also set the start-point of the submatch.  After adding one more
%   state, the \texttt{left_state} is the left end of the group, from
%   which all branches stem, and the \texttt{right_state} is the
%   right end of the group, and all branches end their course in that
%   state.  We store those two integers to be queried for each branch, we
%   build the \textsc{nfa} states for the contents |#2| of the group,
%   and we forget about the two integers.  Once this is done, perform the
%   repetition: either exactly |#3| times (when \meta{more} is $0$), or
%   |#3| or more times (when \meta{more} is $-1$), or
%   between |#3| and $|#3|+|#4|$ times, with laziness |#5|.  The
%   \meta{label} |#1| is used for submatch tracking.  Each of the three
%   auxiliaries expects \texttt{left_state} and \texttt{right_state} to
%   be set properly.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_aux:nnnnN #1#2#3#4#5
  {
    \if_int_compare:w #3 = \c_zero_int
      \@@_build_new_state:
      \@@_build_transition_right:nNn \@@_action_free_group:n
        \l_@@_left_state_int \l_@@_right_state_int
    \fi:
    \@@_build_new_state:
    \@@_push_lr_states:
    #2
    \@@_pop_lr_states:
    \if_case:w - #4 \exp_stop_f:
      \@@_group_repeat:nn {#1} {#3}
    \or:
      \@@_group_repeat:nnN {#1} {#3} #5
    \else:
      \@@_group_repeat:nnnN {#1} {#3} {#4} #5
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group:nnnN, \@@_group_no_capture:nnnN}
%   Hand to \cs{@@_group_aux:nnnnN} the label of that group
%   (expanded), and the group itself, with some extra commands to
%   perform.  For a capturing group, the extra command increments the
%   group counter before the contents of the group are built; a
%   group which does not capture gets the label $-1$.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group:nnnN #1
  {
    \exp_args:No \@@_group_aux:nnnnN
      { \int_use:N \l_@@_capturing_group_int }
      {
        \int_incr:N \l_@@_capturing_group_int
        #1
      }
  }
\cs_new_protected:Npn \@@_group_no_capture:nnnN
  { \@@_group_aux:nnnnN { -1 } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_resetting:nnnN}
% \begin{macro}{\@@_group_resetting_loop:nnNn}
%   Again, hand the label $-1$ to \cs{@@_group_aux:nnnnN}, but this
%   time we work a little bit harder to keep track of the maximum group
%   label at the end of any branch, and to reset the group number at
%   each branch.  This relies on the fact that a compiled regex always is
%   a sequence of items of the form \cs{@@_branch:n} \Arg{branch}.
%   In the loop, |#1| is the largest group number reached so far, and
%   |#2| is the group number from which every branch starts.  For an
%   actual branch, \cs{use_none:nn} removes |#3| and the braced
%   assignment which follows it.  For the end marker
%   |{ ?? \prg_break:n }|, \cs{use_none:nn} removes the two question
%   marks, so that \cs{prg_break:n} ends the loop and performs the
%   assignment of the maximum to \cs{l_@@_capturing_group_int}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_resetting:nnnN #1
  {
    \@@_group_aux:nnnnN { -1 }
      {
        \exp_args:Noo \@@_group_resetting_loop:nnNn
          { \int_use:N \l_@@_capturing_group_int }
          { \int_use:N \l_@@_capturing_group_int }
          #1
          { ?? \prg_break:n } { }
        \prg_break_point:
      }
  }
\cs_new_protected:Npn \@@_group_resetting_loop:nnNn #1#2#3#4
  {
    \use_none:nn #3 { \int_set:Nn \l_@@_capturing_group_int {#1} }
    \int_set:Nn \l_@@_capturing_group_int {#2}
    #3 {#4}
    \exp_args:Ne \@@_group_resetting_loop:nnNn
      { \int_max:nn {#1} \l_@@_capturing_group_int }
      {#2}
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_branch:n}
%   Add a free transition from the left state of the current group to a
%   brand new state, starting point of this branch.  Once the branch is
%   built, add a transition from its last state to the right state of
%   the group.  The left and right states of the group are extracted from
%   the relevant sequences.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_branch:n #1
  {
    \@@_build_new_state:
    \seq_get:NN \l_@@_left_state_seq \l_@@_internal_a_tl
    \int_set:Nn \l_@@_left_state_int \l_@@_internal_a_tl
    \@@_build_transition_right:nNn \@@_action_free:n
      \l_@@_left_state_int \l_@@_right_state_int
    #1
    \seq_get:NN \l_@@_right_state_seq \l_@@_internal_a_tl
    \@@_build_transition_right:nNn \@@_action_free:n
      \l_@@_right_state_int \l_@@_internal_a_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_repeat:nn}
%   This function is called to repeat a group a fixed number of times
%   |#2|; if this is $0$ we remove the group altogether (but don't reset
%   the \texttt{capturing_group} label).  Otherwise, the auxiliary
%   \cs{@@_group_repeat_aux:n} copies |#2| times the \tn{toks} for
%   the group, and leaves \texttt{internal_a} pointing to the left end
%   of the last repetition.  We only record the submatch information at
%   the last repetition.  Finally, add a state at the end (the transition
%   to it has been taken care of by the replicating auxiliary).
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_repeat:nn #1#2
  {
    \if_int_compare:w #2 = \c_zero_int
      \int_set:Nn \l_@@_max_state_int
        { \l_@@_left_state_int - \c_one_int }
      \@@_build_new_state:
    \else:
      \@@_group_repeat_aux:n {#2}
      \@@_group_submatches:nNN {#1}
        \l_@@_internal_a_int \l_@@_right_state_int
      \@@_build_new_state:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_submatches:nNN}
%   This inserts in states |#2| and |#3| the code for tracking
%   submatches of the group |#1|, unless inhibited by a label of $-1$.
%   The code is inserted at the start of each \tn{toks}, ahead of any
%   transition already stored there.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_submatches:nNN #1#2#3
  {
    \if_int_compare:w #1 > - \c_one_int
      \@@_toks_put_left:Ne #2 { \@@_action_submatch:nN {#1} < }
      \@@_toks_put_left:Ne #3 { \@@_action_submatch:nN {#1} > }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_repeat_aux:n}
%   Here we repeat \tn{toks} ranging from \texttt{left_state} to
%   \texttt{max_state}, $|#1|>0$ times.  First add a transition so that
%   the copies \enquote{chain} properly.  Compute the shift
%   \texttt{c} between the original copy and the last copy we
%   want.  Shift the \texttt{right_state} and \texttt{max_state} to their
%   final values.  We then want to perform \texttt{c} copy operations.  At
%   the end, \texttt{b} is equal to the \texttt{max_state}, and
%   \texttt{a} points to the left of the last copy of the group.
%   When $|#1|=1$ nothing is copied: \texttt{a} and \texttt{b} are
%   simply the \texttt{left_state} and the \texttt{max_state}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_repeat_aux:n #1
  {
    \@@_build_transition_right:nNn \@@_action_free:n
      \l_@@_right_state_int \l_@@_max_state_int
    \int_set_eq:NN \l_@@_internal_a_int \l_@@_left_state_int
    \int_set_eq:NN \l_@@_internal_b_int \l_@@_max_state_int
    \if_int_compare:w \@@_int_eval:w #1 > \c_one_int
      \int_set:Nn \l_@@_internal_c_int
        {
          ( #1 - \c_one_int )
          * ( \l_@@_internal_b_int - \l_@@_internal_a_int )
        }
      \int_add:Nn \l_@@_right_state_int \l_@@_internal_c_int
      \int_add:Nn \l_@@_max_state_int \l_@@_internal_c_int
      \@@_toks_memcpy:NNn
        \l_@@_internal_b_int
        \l_@@_internal_a_int
        \l_@@_internal_c_int
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_repeat:nnN}
%   This function is called to repeat a group at least $n$ times; the
%   case $n=0$ is very different from $n>0$.  Assume first that $n=0$.
%   Insert submatch tracking information at the start and end of the
%   group, add a free transition from the right end to the
%   \enquote{true} left state \texttt{a} (remember: in this case we had
%   added an extra state before the left state).  This forms the loop,
%   which we break away from by adding a free transition from \texttt{a}
%   to a new state.
%
%   Now consider the case $n>0$.  Repeat the group $n$ times, chaining
%   various copies with a free transition.  Add submatch tracking only to
%   the last copy, then add a free transition from the right end back to
%   the left end of the last copy, either before or after the transition
%   to move on towards the rest of the \textsc{nfa}.  This transition can
%   end up before submatch tracking, but that is irrelevant since it
%   only does so when going again through the group, recording new
%   matches.  Finally, add a state; we already have a transition pointing
%   to it from \cs{@@_group_repeat_aux:n}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_repeat:nnN #1#2#3
  {
    \if_int_compare:w #2 = \c_zero_int
      \@@_group_submatches:nNN {#1}
        \l_@@_left_state_int \l_@@_right_state_int
      \int_set:Nn \l_@@_internal_a_int
        { \l_@@_left_state_int - \c_one_int }
      \@@_build_transition_right:nNn \@@_action_free:n
        \l_@@_right_state_int \l_@@_internal_a_int
      \@@_build_new_state:
      \if_meaning:w \c_true_bool #3
        \@@_build_transition_left:NNN \@@_action_free:n
          \l_@@_internal_a_int \l_@@_right_state_int
      \else:
        \@@_build_transition_right:nNn \@@_action_free:n
          \l_@@_internal_a_int \l_@@_right_state_int
      \fi:
    \else:
      \@@_group_repeat_aux:n {#2}
      \@@_group_submatches:nNN {#1}
        \l_@@_internal_a_int \l_@@_right_state_int
      \if_meaning:w \c_true_bool #3
        \@@_build_transition_right:nNn \@@_action_free_group:n
          \l_@@_right_state_int \l_@@_internal_a_int
      \else:
        \@@_build_transition_left:NNN \@@_action_free_group:n
          \l_@@_right_state_int \l_@@_internal_a_int
      \fi:
      \@@_build_new_state:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_repeat:nnnN}
%   We wish to repeat the group between |#2| and $|#2|+|#3|$ times, with
%   a laziness controlled by |#4|. We insert submatch tracking up front:
%   in principle, we could avoid recording submatches for the first |#2|
%   copies of the group, but that forces us to treat specially the case
%   $|#2|=0$. Repeat that group with submatch tracking $|#2|+|#3|$ times
%   (the maximum number of repetitions). Then our goal is to add |#3|
%   transitions from the end of the |#2|-th group, and each subsequent
%   group, to the end. For a lazy quantifier, we add those transitions
%   to the left states, before submatch tracking. For the greedy case,
%   we add the transitions to the right states, after submatch tracking
%   and the transitions which go on with more repetitions.
%   In the greedy
%   case with $|#2|=0$, the transition which skips over all copies of
%   the group must be added separately, because its starting state does
%   not follow the normal pattern: we had to add it \enquote{by hand}
%   earlier.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_repeat:nnnN #1#2#3#4
  {
    \@@_group_submatches:nNN {#1}
      \l_@@_left_state_int \l_@@_right_state_int
    \@@_group_repeat_aux:n { #2 + #3 }
    \if_meaning:w \c_true_bool #4
      \int_set_eq:NN \l_@@_left_state_int \l_@@_max_state_int
      \prg_replicate:nn { #3 }
        {
          \int_sub:Nn \l_@@_left_state_int
            { \l_@@_internal_b_int - \l_@@_internal_a_int }
          \@@_build_transition_left:NNN \@@_action_free:n
            \l_@@_left_state_int \l_@@_max_state_int
        }
    \else:
      \prg_replicate:nn { #3 - \c_one_int }
        {
          \int_sub:Nn \l_@@_right_state_int
            { \l_@@_internal_b_int - \l_@@_internal_a_int }
          \@@_build_transition_right:nNn \@@_action_free:n
            \l_@@_right_state_int \l_@@_max_state_int
        }
      \if_int_compare:w #2 = \c_zero_int
        \int_set:Nn \l_@@_right_state_int
          { \l_@@_left_state_int - \c_one_int }
      \else:
        \int_sub:Nn \l_@@_right_state_int
          { \l_@@_internal_b_int - \l_@@_internal_a_int }
      \fi:
      \@@_build_transition_right:nNn \@@_action_free:n
        \l_@@_right_state_int \l_@@_max_state_int
    \fi:
    \@@_build_new_state:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Others}
%
% \begin{macro}{\@@_assertion:Nn, \@@_b_test:, \@@_A_test:, \@@_G_test:, \@@_Z_test:}
%   Usage: \cs{@@_assertion:Nn} \meta{boolean} \Arg{test}, where the
%   \meta{test} is one of the other functions listed here. Add a free
%   transition to a new state, conditionally to the assertion test. The
%   \cs{@@_b_test:} test is used by the |\b| and |\B| escapes: check
%   if the last character was a word character or not, and do the same
%   to the current character. The boundary-markers of the string are
%   non-word characters for this purpose.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_assertion:Nn #1#2
  {
    \@@_build_new_state:
    \@@_toks_put_right:Ne \l_@@_left_state_int
      {
        \exp_not:n {#2}
        \@@_break_point:TF
          \bool_if:NF #1 { { } }
          {
            \@@_action_free:n
              {
                \tex_the:D \@@_int_eval:w
                  \l_@@_right_state_int - \l_@@_left_state_int
              }
          }
          \bool_if:NT #1 { { } }
      }
  }
\cs_new_protected:Npn \@@_b_test:
  {
    \group_begin:
      \int_set_eq:NN \l_@@_curr_char_int \l_@@_last_char_int
      \@@_prop_w:
      \@@_break_point:TF
        { \group_end: \@@_item_reverse:n { \@@_prop_w: } }
        { \group_end: \@@_prop_w: }
  }
\cs_new_protected:Npn \@@_Z_test:
  {
    \if_int_compare:w -2 = \l_@@_curr_char_int
      \exp_after:wN \@@_break_true:w
    \fi:
  }
\cs_new_protected:Npn \@@_A_test:
  {
    \if_int_compare:w -2 = \l_@@_last_char_int
      \exp_after:wN \@@_break_true:w
    \fi:
  }
\cs_new_protected:Npn \@@_G_test:
  {
    \if_int_compare:w \l_@@_curr_pos_int = \l_@@_start_pos_int
      \exp_after:wN \@@_break_true:w
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_command_K:}
%   Change the starting point of the $0$-th submatch (full match), and
%   transition to a new state, pretending that this is a fresh thread.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_command_K:
  {
    \@@_build_new_state:
    \@@_toks_put_right:Ne \l_@@_left_state_int
      {
        \@@_action_submatch:nN \c_zero_int <
        \bool_set_true:N \l_@@_fresh_thread_bool
        \@@_action_free:n
          {
            \tex_the:D \@@_int_eval:w
              \l_@@_right_state_int - \l_@@_left_state_int
          }
        \bool_set_false:N \l_@@_fresh_thread_bool
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Matching}
%
% We search for matches by running all the execution threads through the
% \textsc{nfa} in parallel, reading one token of the query at each step.
% The \textsc{nfa} contains \enquote{free} transitions to other states,
% and transitions which \enquote{consume} the current token. For free
% transitions, the instruction at the new state of the \textsc{nfa} is
% performed immediately.
% When a transition consumes a character, the
% new state is appended to a list of \enquote{active states}, stored in
% \cs{g_@@_thread_info_intarray} (together with submatch information):
% this thread is made active again when the next
% token is read from the query. At every step (for each token in the
% query), we unpack that list of active states and the corresponding
% submatch props, and empty those.
%
% If two paths through the \textsc{nfa} \enquote{collide} in the sense
% that they reach the same state after reading a given token, then they
% only differ in how they previously matched, and any future execution
% would be identical for both. (Note that this would be wrong in the
% presence of back-references.) Hence, we only need to keep one of the
% two threads: the thread with the highest priority. Our \textsc{nfa} is
% built in such a way that higher priority actions always come before
% lower priority actions, which makes things work.
%
% The explanation in the previous paragraph may make us think that we
% simply need to keep track of which states were visited at a given
% step: after all, the loop generated when matching |(a?)*| against |a|
% is broken, isn't it? No. The group first matches |a|, as it should,
% then repeats; it attempts to match |a| again but fails; it skips |a|,
% and finds out that this state has already been seen at this position
% in the query: the match stops. The capturing group is (wrongly) |a|.
% What went wrong is that a thread collided with itself, and the later
% version, which has gone through the group one more time with an empty
% match, should have a higher priority than not going through the group.
%
% We solve this by distinguishing \enquote{normal} free transitions
% \cs{@@_action_free:n} from transitions
% \cs{@@_action_free_group:n} which go back to the start of the
% group.
% The former keeps threads unless they have been visited by a
% \enquote{completed} thread, while the latter kind of transition also
% prevents going back to a state visited by the current thread.
%
% \subsubsection{Variables used when matching}
%
% \begin{variable}
%   {
%     \l_@@_min_pos_int,
%     \l_@@_max_pos_int,
%     \l_@@_curr_pos_int,
%     \l_@@_start_pos_int,
%     \l_@@_success_pos_int,
%   }
%   The tokens in the query are indexed from \texttt{min_pos} for the
%   first to $\texttt{max_pos}-1$ for the last, and their information is
%   stored in several arrays and \tn{toks} registers with those numbers.
%   We match
%   without backtracking, keeping all threads in lockstep at the
%   \texttt{curr_pos} in the query. The starting point of the current
%   match attempt is \texttt{start_pos}, and \texttt{success_pos},
%   updated whenever a thread succeeds, is used as the next starting
%   position.
%    \begin{macrocode}
\int_new:N \l_@@_min_pos_int
\int_new:N \l_@@_max_pos_int
\int_new:N \l_@@_curr_pos_int
\int_new:N \l_@@_start_pos_int
\int_new:N \l_@@_success_pos_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}
%   {
%     \l_@@_curr_char_int,
%     \l_@@_curr_catcode_int,
%     \l_@@_curr_token_tl,
%     \l_@@_last_char_int,
%     \l_@@_last_char_success_int,
%     \l_@@_case_changed_char_int
%   }
%   The character and category codes of the token at the current
%   position and a token list expanding to that token; the character
%   code of the token at the previous position;
%   the character code of the token just before a successful match;
%   and the character code of the result of changing the case of the
%   current token (|A-Z|$\leftrightarrow$|a-z|). This last integer is
%   only computed when necessary, and is otherwise \cs{c_max_int}. The
%   \texttt{curr_char} variable is also used in various other phases
%   to hold a character code.
%    \begin{macrocode}
\int_new:N \l_@@_curr_char_int
\int_new:N \l_@@_curr_catcode_int
\tl_new:N \l_@@_curr_token_tl
\int_new:N \l_@@_last_char_int
\int_new:N \l_@@_last_char_success_int
\int_new:N \l_@@_case_changed_char_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_curr_state_int}
%   For every character in the token list, each of the active states is
%   considered in turn. The variable \cs{l_@@_curr_state_int}
%   holds the state of the \textsc{nfa} which is currently considered:
%   transitions are then given as shifts relative to the current state.
%    \begin{macrocode}
\int_new:N \l_@@_curr_state_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}
%   {\l_@@_curr_submatches_tl, \l_@@_success_submatches_tl}
%   The submatches for the thread which is currently active are stored
%   in the \texttt{curr_submatches} list, which is almost a comma list,
%   but ends with a comma. This list is stored by \cs{@@_store_state:n}
%   into an intarray variable, to be retrieved when matching at the next
%   position. When a thread succeeds, this list is copied to
%   \cs{l_@@_success_submatches_tl}: only the last successful thread
%   remains there.
%    \begin{macrocode}
\tl_new:N \l_@@_curr_submatches_tl
\tl_new:N \l_@@_success_submatches_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_step_int}
%   This integer, always even, is increased every time a character in
%   the query is read, and not reset when doing multiple matches. We
%   store in \cs{g_@@_state_active_intarray} the last step in which each
%   \meta{state} in the \textsc{nfa} was encountered. This lets us break
%   infinite loops by not visiting the same state twice in the same
%   step. In fact, the step we store is equal to \texttt{step} when we
%   have started performing the operations of \tn{toks}\meta{state}, but
%   not finished yet. However, once we finish, we store
%   $\text{\texttt{step}}+1$ in \cs{g_@@_state_active_intarray}. This is
%   needed to track submatches
%   properly (see building phase).
%   The \texttt{step} is also used to
%   attach each set of submatch information to a given iteration (and
%   automatically discard it when it corresponds to a past step).
%    \begin{macrocode}
\int_new:N \l_@@_step_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_min_thread_int, \l_@@_max_thread_int}
%   All the currently active threads are kept in order of precedence in
%   \cs{g_@@_thread_info_intarray} together with the corresponding
%   submatch information. Data in this intarray is organized as blocks
%   from \texttt{min_thread} (included) to \texttt{max_thread}
%   (excluded). At the start of every step, the whole array is
%   unpacked, so that the space can immediately be reused, and
%   \texttt{max_thread} is reset to \texttt{min_thread}, effectively
%   clearing the array.
%    \begin{macrocode}
\int_new:N \l_@@_min_thread_int
\int_new:N \l_@@_max_thread_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_state_active_intarray, \g_@@_thread_info_intarray}
%   \cs{g_@@_state_active_intarray} stores the last \meta{step} in which
%   each \meta{state} was active. \cs{g_@@_thread_info_intarray} stores
%   threads to be considered in the next step, more precisely the
%   states in which these threads are.
%    \begin{macrocode}
\intarray_new:Nn \g_@@_state_active_intarray { 65536 }
\intarray_new:Nn \g_@@_thread_info_intarray { 65536 }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_matched_analysis_tl, \l_@@_curr_analysis_tl}
%   The list \cs{l_@@_curr_analysis_tl} consists of a brace group
%   containing three brace groups corresponding to the current token,
%   with the same syntax as \cs{tl_analysis_map_inline:nn}. The list
%   \cs{l_@@_matched_analysis_tl} (constructed under the
%   \texttt{tl\_build} machinery) has one item for each token that has
%   already been treated so far in a given match attempt: each item
%   consists of three brace groups with the same syntax as
%   \cs{tl_analysis_map_inline:nn}.
%    \begin{macrocode}
\tl_new:N \l_@@_matched_analysis_tl
\tl_new:N \l_@@_curr_analysis_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_every_match_tl}
%   Every time a match is found, this token list is used. For single
%   matching, the token list is empty. For multiple matching, the token
%   list is set to repeat the matching, after performing some operation
%   which depends on the user function. See \cs{@@_single_match:} and
%   \cs{@@_multi_match:n}.
%    \begin{macrocode}
\tl_new:N \l_@@_every_match_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_fresh_thread_bool, \l_@@_empty_success_bool}
% \begin{macro}{\@@_if_two_empty_matches:F}
%   When doing multiple matches, we need to avoid infinite loops where
%   each iteration matches the same empty token list. When an empty
%   token list is matched, the next successful match of the same empty
%   token list is suppressed. We detect empty matches by setting
%   \cs{l_@@_fresh_thread_bool} to \texttt{true} for threads which
%   directly come from the start of the regex or from the |\K| command,
%   and testing that boolean whenever a thread succeeds. The function
%   \cs{@@_if_two_empty_matches:F} is redefined at every match
%   attempt, depending on whether the previous match was empty or not:
%   if it was, then the function must cancel a purported success if it
%   is empty and at the same spot as the previous match; otherwise, we
%   definitely don't have two identical empty matches, so the function
%   is \cs{use:n}.
%    \begin{macrocode}
\bool_new:N \l_@@_fresh_thread_bool
\bool_new:N \l_@@_empty_success_bool
\cs_new_eq:NN \@@_if_two_empty_matches:F \use:n
%    \end{macrocode}
% \end{macro}
% \end{variable}
%
% \begin{variable}
%   {
%     \g_@@_success_bool,
%     \l_@@_saved_success_bool,
%     \l_@@_match_success_bool
%   }
%   The boolean \cs{l_@@_match_success_bool} is true if the current
%   match attempt was successful, and \cs{g_@@_success_bool} is true
%   if there was at least one successful match.
%   This is the only global
%   variable in this whole module, but we would need it to be local when
%   matching a control sequence with |\c{...}|. This is done by saving
%   the global variable into \cs{l_@@_saved_success_bool}, which is
%   local, hence not affected by the changes due to inner regex
%   functions.
%    \begin{macrocode}
\bool_new:N \g_@@_success_bool
\bool_new:N \l_@@_saved_success_bool
\bool_new:N \l_@@_match_success_bool
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Matching: framework}
%
% \begin{macro}{\@@_match:n, \@@_match_cs:n}
% \begin{macro}{\@@_match_init:}
%   Initialize the variables that should
%   be set once for each user function (even for multiple
%   matches). Namely, the overall matching is not yet successful; none of
%   the states should be marked as visited (\cs{g_@@_state_active_intarray}), and
%   we start at step $0$; we pretend that there was a previous match
%   ending at the start of the query, which was not empty (to avoid
%   smothering an empty match at the start). Once all this is set up, we
%   are ready for the ride. Find the first match.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_match:n #1
  {
    \@@_match_init:
    \@@_match_once_init:
    \tl_analysis_map_inline:nn {#1}
      { \@@_match_one_token:nnN {##1} {##2} ##3 }
    \@@_match_one_token:nnN { } { -2 } F
    \prg_break_point:Nn \@@_maplike_break: { }
  }
\cs_new_protected:Npn \@@_match_cs:n #1
  {
    \int_set_eq:NN \l_@@_min_thread_int \l_@@_max_thread_int
    \@@_match_init:
    \@@_match_once_init:
    \str_map_inline:nn {#1}
      {
        \tl_if_blank:nTF {##1}
          { \@@_match_one_token:nnN {##1} {`##1} A }
          { \@@_match_one_token:nnN {##1} {`##1} C }
      }
    \@@_match_one_token:nnN { } { -2 } F
    \prg_break_point:Nn \@@_maplike_break: { }
  }
\cs_new_protected:Npn \@@_match_init:
  {
    \bool_gset_false:N \g_@@_success_bool
    \int_step_inline:nnn
      \l_@@_min_state_int { \l_@@_max_state_int - \c_one_int }
      {
        \__kernel_intarray_gset:Nnn
          \g_@@_state_active_intarray {##1} \c_one_int
      }
    \int_zero:N \l_@@_step_int
    \int_set:Nn \l_@@_min_pos_int { 2 }
    \int_set_eq:NN \l_@@_success_pos_int \l_@@_min_pos_int
    \int_set:Nn \l_@@_last_char_success_int { -2 }
    \tl_build_begin:N \l_@@_matched_analysis_tl
    \tl_clear:N \l_@@_curr_analysis_tl
    \int_set_eq:NN \l_@@_min_submatch_int \c_one_int
    \int_set_eq:NN \l_@@_submatch_int \l_@@_min_submatch_int
    \bool_set_false:N \l_@@_empty_success_bool
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_match_once_init:}
%   This function resets various variables used when finding one match.
%   It is called before the loop through characters, and every time we
%   find a match, before searching for another match (this is controlled
%   by the \texttt{every_match} token list).
%
%   First initialize some variables: set the
%   conditional which detects identical empty matches; this match
%   attempt starts at the previous \texttt{success_pos}, is not yet
%   successful, and has no submatches yet; clear the array of active
%   threads, and put the starting state $0$ in it.
%   We are then almost
%   ready to read our first token in the query, but we actually start
%   one position earlier than the start because
%   \cs{@@_match_one_token:nnN} increments \cs{l_@@_curr_pos_int} and
%   saves \cs{l_@@_curr_char_int} as the \texttt{last_char} so that word
%   boundaries can be correctly identified.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_match_once_init:
  {
    \if_meaning:w \c_true_bool \l_@@_empty_success_bool
      \cs_set:Npn \@@_if_two_empty_matches:F
        {
          \int_compare:nNnF
            \l_@@_start_pos_int = \l_@@_curr_pos_int
        }
    \else:
      \cs_set_eq:NN \@@_if_two_empty_matches:F \use:n
    \fi:
    \int_set_eq:NN \l_@@_start_pos_int \l_@@_success_pos_int
    \bool_set_false:N \l_@@_match_success_bool
    \tl_set:Ne \l_@@_curr_submatches_tl
      { \prg_replicate:nn { 2 * \l_@@_capturing_group_int } { 0 , } }
    \int_set_eq:NN \l_@@_max_thread_int \l_@@_min_thread_int
    \@@_store_state:n { \l_@@_min_state_int }
    \int_set:Nn \l_@@_curr_pos_int
      { \l_@@_start_pos_int - \c_one_int }
    \int_set_eq:NN \l_@@_curr_char_int \l_@@_last_char_success_int
    \tl_build_get_intermediate:NN \l_@@_matched_analysis_tl \l_@@_internal_a_tl
    \exp_args:NNf \@@_match_once_init_aux:
    \tl_map_inline:nn
      { \exp_after:wN \l_@@_internal_a_tl \l_@@_curr_analysis_tl }
      { \@@_match_one_token:nnN ##1 }
    \prg_break_point:Nn \@@_maplike_break: { }
  }
\cs_new_protected:Npn \@@_match_once_init_aux:
  {
    \tl_build_begin:N \l_@@_matched_analysis_tl
    \tl_clear:N \l_@@_curr_analysis_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_single_match:, \@@_multi_match:n}
%   For a single match, the overall success is determined by whether the
%   only match attempt is a success. When doing multiple matches, the
%   overall matching is successful as soon as any match
%   succeeds. Perform the action |#1|, then find the next match.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_single_match:
  {
    \tl_set:Nn \l_@@_every_match_tl
      {
        \bool_gset_eq:NN
          \g_@@_success_bool
          \l_@@_match_success_bool
        \@@_maplike_break:
      }
  }
\cs_new_protected:Npn \@@_multi_match:n #1
  {
    \tl_set:Nn \l_@@_every_match_tl
      {
        \if_meaning:w \c_false_bool \l_@@_match_success_bool
          \exp_after:wN \@@_maplike_break:
        \fi:
        \bool_gset_true:N \g_@@_success_bool
        #1
        \@@_match_once_init:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_match_one_token:nnN}
% \begin{macro}[rEXP]{\@@_match_one_active:n}
%   At each new position, set some variables and get the new character
%   and category from the query. Then unpack the array of active
%   threads, and clear it by resetting its length
%   (\texttt{max_thread}). This results in a sequence of
%   \cs{@@_use_state_and_submatches:w} \meta{state}|,|\meta{submatch-clist}|;| and
%   we consider those states one by one in order. As soon as a thread
%   succeeds, exit the step, and, if there are threads to consider at the
%   next position, and we have not reached the end of the string,
%   repeat the loop. Otherwise, the last thread that succeeded is the
%   match. We explain the \texttt{fresh_thread} business when
%   describing \cs{@@_action_start_wildcard:N}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_match_one_token:nnN #1#2#3
  {
    \int_add:Nn \l_@@_step_int { 2 }
    \int_incr:N \l_@@_curr_pos_int
    \int_set_eq:NN \l_@@_last_char_int \l_@@_curr_char_int
    \cs_set_eq:NN \@@_maybe_compute_ccc:
      \@@_compute_case_changed_char:
    \tl_set:Nn \l_@@_curr_token_tl {#1}
    \int_set:Nn \l_@@_curr_char_int {#2}
    \int_set:Nn \l_@@_curr_catcode_int { "#3 }
    \tl_build_put_right:Ne \l_@@_matched_analysis_tl
      { \exp_not:o \l_@@_curr_analysis_tl }
    \tl_set:Nn \l_@@_curr_analysis_tl { { {#1} {#2} #3 } }
    \use:e
      {
        \int_set_eq:NN \l_@@_max_thread_int \l_@@_min_thread_int
        \int_step_function:nnN
          \l_@@_min_thread_int
          { \l_@@_max_thread_int - \c_one_int }
          \@@_match_one_active:n
      }
    \prg_break_point:
    \bool_set_false:N \l_@@_fresh_thread_bool
    \if_int_compare:w \l_@@_max_thread_int > \l_@@_min_thread_int
      \if_int_compare:w -2 < \l_@@_curr_char_int
        \exp_after:wN \use_i:nn
      \fi:
    \fi:
    \l_@@_every_match_tl
  }
\cs_new:Npn \@@_match_one_active:n #1
  {
    \@@_use_state_and_submatches:w
    \__kernel_intarray_range_to_clist:Nnn
      \g_@@_thread_info_intarray
      { \c_one_int + #1 * (\l_@@_capturing_group_int * 2 + \c_one_int) }
      { (\c_one_int + #1) * (\l_@@_capturing_group_int * 2 + \c_one_int) }
    ;
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Using states of the \textsc{nfa}}
%
% \begin{macro}{\@@_use_state:}
%   Use the current \textsc{nfa} instruction. The state is initially
%   marked as belonging to the current \texttt{step}: this allows normal
%   free transitions to repeat, but group-repeating transitions
%   won't. Once we are done exploring all the branches it spawned, the
%   state is marked as $\texttt{step}+1$: any thread hitting it at that
%   point will be terminated.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_use_state:
  {
    \__kernel_intarray_gset:Nnn \g_@@_state_active_intarray
      \l_@@_curr_state_int \l_@@_step_int
    \@@_toks_use:w \l_@@_curr_state_int
    \__kernel_intarray_gset:Nnn \g_@@_state_active_intarray
      \l_@@_curr_state_int
      { \@@_int_eval:w \l_@@_step_int + \c_one_int \scan_stop: }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_use_state_and_submatches:w}
%   This function is called as one item in the array of active threads
%   after that array has been unpacked for a new step. Update the
%   \texttt{curr_state} and \texttt{curr_submatches} and use the
%   state if it has not yet been encountered at this step.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_use_state_and_submatches:w #1 , #2 ;
  {
    \int_set:Nn \l_@@_curr_state_int {#1}
    \if_int_compare:w
        \__kernel_intarray_item:Nn \g_@@_state_active_intarray
          \l_@@_curr_state_int
                      < \l_@@_step_int
      \tl_set:Nn \l_@@_curr_submatches_tl { #2 , }
      \exp_after:wN \@@_use_state:
    \fi:
    \scan_stop:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Actions when matching}
%
% \begin{macro}{\@@_action_start_wildcard:N}
%   For an unanchored match, state $0$ has a free transition to the next
%   and a costly one to itself, to repeat at the next position. To catch
%   repeated identical empty matches, we need to know if a successful
%   thread corresponds to an empty match. The instruction resetting
%   \cs{l_@@_fresh_thread_bool} may be skipped by a successful
%   thread, hence we had to add it to \cs{@@_match_one_token:nnN} too.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_start_wildcard:N #1
  {
    \bool_set_true:N \l_@@_fresh_thread_bool
    \@@_action_free:n {1}
    \bool_set_false:N \l_@@_fresh_thread_bool
    \bool_if:NT #1 { \@@_action_cost:n {0} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_action_free:n, \@@_action_free_group:n}
% \begin{macro}{\@@_action_free_aux:nn}
%   These functions copy a thread after checking that the \textsc{nfa}
%   state has not already been used at this position.
%   If not, store
%   submatches in the new state, and insert the instructions for that
%   state in the input stream. Then restore the old value of
%   \cs{l_@@_curr_state_int} and of the current submatches. The
%   two types of free transitions differ by how they test that the state
%   has not been encountered yet: the \texttt{group} version is
%   stricter, and will not use a state if it was used earlier in the
%   current thread, hence forcefully breaking the loop, while the
%   \enquote{normal} version will revisit a state even within the thread
%   itself.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_free:n
  { \@@_action_free_aux:nn { > \l_@@_step_int \else: } }
\cs_new_protected:Npn \@@_action_free_group:n
  { \@@_action_free_aux:nn { < \l_@@_step_int } }
\cs_new_protected:Npn \@@_action_free_aux:nn #1#2
  {
    \use:e
      {
        \int_add:Nn \l_@@_curr_state_int {#2}
        \exp_not:n
          {
            \if_int_compare:w
                \__kernel_intarray_item:Nn \g_@@_state_active_intarray
                  \l_@@_curr_state_int
                #1
              \exp_after:wN \@@_use_state:
            \fi:
          }
        \int_set:Nn \l_@@_curr_state_int
          { \int_use:N \l_@@_curr_state_int }
        \tl_set:Nn \exp_not:N \l_@@_curr_submatches_tl
          { \exp_not:o \l_@@_curr_submatches_tl }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_action_cost:n}
%   A transition which consumes the current character and shifts the
%   state by |#1|. The resulting state is stored in the appropriate array
%   for use at the next position, and we also store the current
%   submatches.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_cost:n #1
  {
    \exp_args:No \@@_store_state:n
      { \tex_the:D \@@_int_eval:w \l_@@_curr_state_int + #1 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_store_state:n}
% \begin{macro}{\@@_store_submatches:nn}
%   Put the given state and current submatch information in
%   \cs{g_@@_thread_info_intarray}, and increment the length of the
%   array.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_store_state:n #1
  {
    \exp_args:No \@@_store_submatches:nn
      \l_@@_curr_submatches_tl {#1}
    \int_incr:N \l_@@_max_thread_int
  }
\cs_new_protected:Npn \@@_store_submatches:nn #1#2
  {
    \__kernel_intarray_gset_range_from_clist:Nnn
      \g_@@_thread_info_intarray
      {
        \@@_int_eval:w
        \c_one_int + \l_@@_max_thread_int *
        (\l_@@_capturing_group_int * 2 + \c_one_int)
      }
      { #2 , #1 }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_disable_submatches:}
%   Some user functions don't require tracking submatches.
%   We get a performance improvement by simply defining the
%   relevant functions to remove their argument and do nothing
%   with it.
%   ^^A NOTE(review): the first redefinition below targets
%   ^^A \@@_store_submatches:n, whereas \@@_store_state:n calls
%   ^^A \@@_store_submatches:nn; confirm whether this is intentional.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_disable_submatches:
  {
    \cs_set_protected:Npn \@@_store_submatches:n ##1 { }
    \cs_set_protected:Npn \@@_action_submatch:nN ##1##2 { }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_action_submatch:nN, \@@_action_submatch_aux:w, \@@_action_submatch_auxii:w, \@@_action_submatch_auxiii:w, \@@_action_submatch_auxiv:w}
%   Update the current submatches with the information from the current
%   position. Maybe a bottleneck.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_submatch:nN #1#2
  {
    \exp_after:wN \@@_action_submatch_aux:w
    \l_@@_curr_submatches_tl ; {#1} #2
  }
\cs_new_protected:Npn \@@_action_submatch_aux:w #1 ; #2#3
  {
    \tl_set:Ne \l_@@_curr_submatches_tl
      {
        \prg_replicate:nn
          { #2 \if_meaning:w > #3 + \l_@@_capturing_group_int \fi: }
          { \@@_action_submatch_auxii:w }
        \@@_action_submatch_auxiii:w
        #1
      }
  }
\cs_new:Npn \@@_action_submatch_auxii:w
    #1 \@@_action_submatch_auxiii:w #2 ,
  { #2 , #1 \@@_action_submatch_auxiii:w }
\cs_new:Npn \@@_action_submatch_auxiii:w #1 ,
  { \int_use:N \l_@@_curr_pos_int , }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_action_success:}
%   There is a successful match when an execution path reaches the last
%   state in the \textsc{nfa}, unless this marks a second identical
%   empty match.
%   Then mark that there was a successful match; it is
%   empty if it is \enquote{fresh}; and we store the current position
%   and submatches. The current step is then interrupted with
%   \cs{prg_break:}, and only paths with higher precedence are
%   pursued further. The values stored here may be overwritten by a
%   later success of a path with higher precedence.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_success:
  {
    \@@_if_two_empty_matches:F
      {
        \bool_set_true:N \l_@@_match_success_bool
        \bool_set_eq:NN \l_@@_empty_success_bool
          \l_@@_fresh_thread_bool
        \int_set_eq:NN \l_@@_success_pos_int \l_@@_curr_pos_int
        \int_set_eq:NN \l_@@_last_char_success_int \l_@@_last_char_int
        \tl_build_begin:N \l_@@_matched_analysis_tl
        \tl_set_eq:NN \l_@@_success_submatches_tl
          \l_@@_curr_submatches_tl
        \prg_break:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Replacement}
%
% \subsubsection{Variables and helpers used in replacement}
%
% \begin{variable}{\l_@@_replacement_csnames_int}
%   The behaviour of closing braces inside a replacement text depends on
%   whether a sequence |\c{| or |\u{| has been encountered. The number
%   of \enquote{open} such sequences that should be closed by |}| is
%   stored in \cs{l_@@_replacement_csnames_int}, and decreased by
%   $1$ by each |}|.
%    \begin{macrocode}
\int_new:N \l_@@_replacement_csnames_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_replacement_category_tl, \l_@@_replacement_category_seq}
%   This sequence of letters is used to correctly restore categories in
%   nested constructions such as |\cL(abc\cD(_)d)|.
%    \begin{macrocode}
\tl_new:N \l_@@_replacement_category_tl
\seq_new:N \l_@@_replacement_category_seq
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_balance_tl}
%   This token list holds the replacement text for
%   \cs{@@_replacement_balance_one_match:n} while it is being built
%   incrementally.
%    \begin{macrocode}
\tl_new:N \g_@@_balance_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}[rEXP]{\@@_replacement_balance_one_match:n}
%   This expects as an argument the first index of a set of entries in
%   \cs{g_@@_submatch_begin_intarray} (and related arrays) which hold the
%   submatch information for a given match. It
%   can be used within an integer expression to obtain the brace balance
%   incurred by performing the replacement on that match. This combines
%   the braces lost by removing the match, braces added by all the
%   submatches appearing in the replacement, and braces appearing
%   explicitly in the replacement. Even though it is always redefined
%   before use, we initialize it as for an empty replacement. An
%   important property is that concatenating several calls to that
%   function must result in a valid integer expression (hence a leading
%   |+| in the actual definition).
%    \begin{macrocode}
\cs_new:Npn \@@_replacement_balance_one_match:n #1
  { - \@@_submatch_balance:n {#1} }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_replacement_do_one_match:n}
%   The input is the same as \cs{@@_replacement_balance_one_match:n}.
%   This function is redefined to expand to the part of the token list
%   from the end of the previous match to a given match, followed by the
%   replacement text. Hence concatenating the result of this function
%   with all possible arguments (one call for each match), as well as
%   the range from the end of the last match to the end of the string,
%   produces the fully replaced token list. The initialization does
%   not matter, but (as an example) we set it as for an empty replacement.
%    \begin{macrocode}
\cs_new:Npn \@@_replacement_do_one_match:n #1
  {
    \@@_query_range:nn
      { \__kernel_intarray_item:Nn \g_@@_submatch_prev_intarray {#1} }
      { \__kernel_intarray_item:Nn \g_@@_submatch_begin_intarray {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_exp_not:N}
%   This function lets us navigate around the fact that the primitive
%   \cs{exp_not:n} requires a braced argument. As far as I can tell, it
%   is only needed if the user tries to include in the replacement text
%   a control sequence set equal to a macro parameter character, such as
%   \cs{c_parameter_token}. Indeed, within an \texttt{e}/\texttt{x}-expanding
%   assignment, \cs{exp_not:N}~|#| behaves as a single |#|, whereas
%   \cs{exp_not:n}~|{#}| behaves as a doubled |##|.
%    \begin{macrocode}
\cs_new:Npn \@@_replacement_exp_not:N #1 { \exp_not:n {#1} }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_exp_not:V}
%   This is used for the implementation of~|\u|, and it gets redefined
%   for \cs{peek_regex_replace_once:nnTF}. Initially it is a copy of
%   \cs{exp_not:V}.
%    \begin{macrocode}
\cs_new_eq:NN \@@_replacement_exp_not:V \exp_not:V
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Query and brace balance}
%
% \begin{macro}[rEXP]{\@@_query_range:nn}
% \begin{macro}[rEXP]{\@@_query_range_loop:ww}
%   When it is time to extract submatches from the token list, the
%   various tokens are stored in \tn{toks} registers numbered from
%   \cs{l_@@_min_pos_int} inclusive to \cs{l_@@_max_pos_int}
%   exclusive. The function \cs{@@_query_range:nn} \Arg{min}
%   \Arg{max} unpacks registers from the position \meta{min} to the
%   position $\meta{max}-1$ included. Once this is expanded, a second
%   \texttt{e}-expansion results in the actual tokens from the
%   query. That second expansion is only done by user functions at the
%   very end of their operation, after checking (and correcting) the
%   brace balance first.
%   Both bounds are evaluated as integer expressions before the loop
%   starts. The loop stops through \cs{prg_break:n} as soon as the
%   current position is no longer less than the upper bound.
%    \begin{macrocode}
\cs_new:Npn \@@_query_range:nn #1#2
  {
    \exp_after:wN \@@_query_range_loop:ww
    \int_value:w \@@_int_eval:w #1 \exp_after:wN ;
    \int_value:w \@@_int_eval:w #2 ;
    \prg_break_point:
  }
\cs_new:Npn \@@_query_range_loop:ww #1 ; #2 ;
  {
    \if_int_compare:w #1 < #2 \exp_stop_f:
    \else:
      \prg_break:n
    \fi:
    \@@_toks_use:w #1 \exp_stop_f:
    \exp_after:wN \@@_query_range_loop:ww
      \int_value:w \@@_int_eval:w #1 + \c_one_int ; #2 ;
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_query_submatch:n}
%   Find the start and end positions for a given submatch (of a given match).
%    \begin{macrocode}
\cs_new:Npn \@@_query_submatch:n #1
  {
    \@@_query_range:nn
      { \__kernel_intarray_item:Nn \g_@@_submatch_begin_intarray {#1} }
      { \__kernel_intarray_item:Nn \g_@@_submatch_end_intarray {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_submatch_balance:n}
%   Every user function must result in a balanced token list (unbalanced
%   token lists cannot be stored by TeX). When we unpacked the query, we
%   kept track of the brace balance, hence the contribution from a given
%   range is the difference between the brace balances at the
%   \meta{max~pos} and \meta{min~pos}. These two positions are found in
%   the corresponding \enquote{submatch} arrays. Each lookup in
%   \cs{g_@@_balance_intarray} is given \cs{c_zero_int} as the last
%   argument of \cs{@@_intarray_item:NnF} (presumably a fallback value
%   for positions that are not in the array; that auxiliary is defined
%   elsewhere).
%    \begin{macrocode}
\cs_new_protected:Npn \@@_submatch_balance:n #1
  {
    \tex_the:D \@@_int_eval:w
      \@@_intarray_item:NnF \g_@@_balance_intarray
        {
          \__kernel_intarray_item:Nn
            \g_@@_submatch_end_intarray {#1}
        }
        \c_zero_int
      -
      \@@_intarray_item:NnF \g_@@_balance_intarray
        {
          \__kernel_intarray_item:Nn
            \g_@@_submatch_begin_intarray {#1}
        }
        \c_zero_int
    \scan_stop:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Framework}
%
% \begin{macro}{\@@_replacement:n, \@@_replacement:e}
% \begin{macro}{\@@_replacement_apply:Nn, \@@_replacement_set:n}
%   The replacement text is built incrementally.
%   We keep track in
%   \cs{l_@@_balance_int} of the balance of explicit begin- and
%   end-group tokens and we store in \cs{g_@@_balance_tl} some
%   code to compute the brace balance from submatches (see its
%   description). Detect unescaped left and right braces, and escaped
%   characters, with trailing \cs{prg_do_nothing:} because some of the
%   later functions look ahead. Once the whole replacement text has been
%   parsed, make sure that there is no open csname and no unclosed
%   category group. Finally, define the
%   \texttt{balance_one_match} and \texttt{do_one_match} functions.
%   In \cs{@@_replacement_apply:Nn}, the function |#1| receives the
%   parsed replacement text after the group is closed.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement:n
  { \@@_replacement_apply:Nn \@@_replacement_set:n }
\cs_new_protected:Npn \@@_replacement_apply:Nn #1#2
  {
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \int_zero:N \l_@@_balance_int
      \tl_gclear:N \g_@@_balance_tl
      \@@_escape_use:nnnn
        {
          \if_charcode:w \c_right_brace_str ##1
            \@@_replacement_rbrace:N
          \else:
            \if_charcode:w \c_left_brace_str ##1
              \@@_replacement_lbrace:N
            \else:
              \@@_replacement_normal:n
            \fi:
          \fi:
          ##1
        }
        { \@@_replacement_escaped:N ##1 }
        { \@@_replacement_normal:n ##1 }
        {#2}
      \prg_do_nothing: \prg_do_nothing:
      \if_int_compare:w \l_@@_replacement_csnames_int > \c_zero_int
        \msg_error:nne { regex } { replacement-missing-rbrace }
          { \int_use:N \l_@@_replacement_csnames_int }
        \tl_build_put_right:Ne \l_@@_build_tl
          { \prg_replicate:nn \l_@@_replacement_csnames_int \cs_end: }
      \fi:
      \seq_if_empty:NF \l_@@_replacement_category_seq
        {
          \msg_error:nne { regex } { replacement-missing-rparen }
            { \seq_count:N \l_@@_replacement_category_seq }
          \seq_clear:N \l_@@_replacement_category_seq
        }
      \tl_gput_right:Ne \g_@@_balance_tl
        { + \int_use:N \l_@@_balance_int }
      \tl_build_end:N \l_@@_build_tl
      \exp_args:NNo
    \group_end:
    #1 \l_@@_build_tl
  }
\cs_generate_variant:Nn \@@_replacement:n { e }
\cs_new_protected:Npn \@@_replacement_set:n #1
  {
    \cs_set:Npn \@@_replacement_do_one_match:n ##1
      {
        \@@_query_range:nn
          { \__kernel_intarray_item:Nn \g_@@_submatch_prev_intarray {##1} }
          { \__kernel_intarray_item:Nn \g_@@_submatch_begin_intarray {##1} }
        #1
      }
    \exp_args:Nno \use:n
      { \cs_gset:Npn \@@_replacement_balance_one_match:n ##1 }
      {
        \g_@@_balance_tl
        - \@@_submatch_balance:n {##1}
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_case_replacement:n, \@@_case_replacement:e}
%   The argument |#1| is a list of replacement texts. Each one is
%   parsed by \cs{@@_replacement_apply:Nn}, and
%   \cs{@@_case_replacement_aux:n} appends the parsed replacement as an
%   \cs{or:} branch to \cs{g_@@_case_replacement_tl}, and the contents
%   of \cs{g_@@_balance_tl} as an \cs{or:} branch to
%   \cs{g_@@_case_balance_tl}. Both token lists start with an
%   \cs{if_case:w} test on the entry of
%   \cs{g_@@_submatch_case_intarray} for the match at hand, and both are
%   closed by \cs{fi:} before being handed to
%   \cs{@@_replacement_set:n} (through \cs{g_@@_balance_tl} in the case
%   of the balance).
%    \begin{macrocode}
\tl_new:N \g_@@_case_replacement_tl
\tl_new:N \g_@@_case_balance_tl
\cs_new_protected:Npn \@@_case_replacement:n #1
  {
    \tl_gset:Nn \g_@@_case_balance_tl
      {
        \if_case:w
          \__kernel_intarray_item:Nn
            \g_@@_submatch_case_intarray {##1}
      }
    \tl_gset_eq:NN \g_@@_case_replacement_tl \g_@@_case_balance_tl
    \tl_map_tokens:nn {#1}
      { \@@_replacement_apply:Nn \@@_case_replacement_aux:n }
    \tl_gset:No \g_@@_balance_tl
      { \g_@@_case_balance_tl \fi: }
    \exp_args:No \@@_replacement_set:n
      { \g_@@_case_replacement_tl \fi: }
  }
\cs_generate_variant:Nn \@@_case_replacement:n { e }
\cs_new_protected:Npn \@@_case_replacement_aux:n #1
  {
    \tl_gput_right:Nn \g_@@_case_replacement_tl { \or: #1 }
    \tl_gput_right:No \g_@@_case_balance_tl
      { \exp_after:wN \or: \g_@@_balance_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_put:n}
%   This gets redefined for \cs{peek_regex_replace_once:nnTF}.
%   By default it appends its argument to \cs{l_@@_build_tl}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_put:n
  { \tl_build_put_right:Nn \l_@@_build_tl }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_normal:n, \@@_replacement_normal_aux:N}
%   Most characters are simply sent to the output by
%   \cs{tl_build_put_right:Nn}, unless a particular category code has been
%   requested: then \cs{@@_replacement_c_A:w} or a similar auxiliary is
%   called. One exception is right parentheses, which restore the
%   category code in place before the group started. Note that the
%   sequence is non-empty there: it contains an empty entry
%   corresponding to the initial value of
%   \cs{l_@@_replacement_category_tl}.
%   The argument |#1| is a single character (including the case of a catcode-other space).
%   In case no specific catcode is requested, we take into account the
%   current catcode regime (at the time the replacement is performed)
%   as much as reasonable, with all impossible catcodes (escape,
%   newline, etc.) being mapped to \enquote{other}. Inside a |\c{...}|
%   or |\u{...}| construction the character is instead stored as a
%   string, using \cs{token_to_str:N}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_normal:n #1
  {
    \int_compare:nNnTF \l_@@_replacement_csnames_int > \c_zero_int
      { \exp_args:No \@@_replacement_put:n { \token_to_str:N #1 } }
      {
        \tl_if_empty:NTF \l_@@_replacement_category_tl
          { \@@_replacement_normal_aux:N #1 }
          { % (
            \token_if_eq_charcode:NNTF #1 )
              {
                \seq_pop:NN \l_@@_replacement_category_seq
                  \l_@@_replacement_category_tl
              }
              {
                \use:c { @@_replacement_c_ \l_@@_replacement_category_tl :w }
                  ? #1
              }
          }
      }
  }
\cs_new_protected:Npn \@@_replacement_normal_aux:N #1
  {
    \token_if_eq_charcode:NNTF #1 \c_space_token
      { \@@_replacement_c_S:w }
      {
        \exp_after:wN \exp_after:wN
        \if_case:w \tex_catcode:D `#1 \exp_stop_f:
             \@@_replacement_c_O:w
        \or: \@@_replacement_c_B:w
        \or: \@@_replacement_c_E:w
        \or: \@@_replacement_c_M:w
        \or: \@@_replacement_c_T:w
        \or: \@@_replacement_c_O:w
        \or: \@@_replacement_c_P:w
        \or: \@@_replacement_c_U:w
        \or: \@@_replacement_c_D:w
        \or: \@@_replacement_c_O:w
        \or: \@@_replacement_c_S:w
        \or: \@@_replacement_c_L:w
        \or: \@@_replacement_c_O:w
        \or: \@@_replacement_c_A:w
        \else: \@@_replacement_c_O:w
        \fi:
      }
    ? #1
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_escaped:N}
%   As in parsing a regular expression, we use an auxiliary built from
%   |#1| if defined. Otherwise, check for escaped digits (standing for
%   submatches from $0$ to $9$): anything else is a raw character.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_escaped:N #1
  {
    \cs_if_exist_use:cF { @@_replacement_#1:w }
      {
        \if_int_compare:w \c_one_int < 1#1 \exp_stop_f:
          \@@_replacement_put_submatch:n {#1}
        \else:
          \@@_replacement_normal:n {#1}
        \fi:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Submatches}
%
% \begin{macro}{\@@_replacement_put_submatch:n, \@@_replacement_put_submatch_aux:n}
%   Insert a submatch in the replacement text. This is dropped (with an
%   error message) if the submatch number is not less than
%   \cs{l_@@_capturing_group_int}, namely if it is larger than the
%   number of capturing groups.
%   Unless the submatch appears inside a |\c{...}| or |\u{...}|
%   construction, it must be taken into account in the brace balance.
%   Later on, |##1| will be replaced by a pointer to the $0$-th submatch for a
%   given match.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_put_submatch:n #1
  {
    \if_int_compare:w #1 < \l_@@_capturing_group_int
      \@@_replacement_put_submatch_aux:n {#1}
    \else:
      \msg_expandable_error:nnff { regex } { submatch-too-big }
        {#1} { \int_eval:n { \l_@@_capturing_group_int - \c_one_int } }
    \fi:
  }
\cs_new_protected:Npn \@@_replacement_put_submatch_aux:n #1
  {
    \tl_build_put_right:Nn \l_@@_build_tl
      { \@@_query_submatch:n { \@@_int_eval:w #1 + ##1 \scan_stop: } }
    \if_int_compare:w \l_@@_replacement_csnames_int = \c_zero_int
      \tl_gput_right:Nn \g_@@_balance_tl
        {
          + \@@_submatch_balance:n
            { \@@_int_eval:w #1 + ##1 \scan_stop: }
        }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_g:w}
% \begin{macro}[rEXP]{\@@_replacement_g_digits:NN}
%   Grab digits for the |\g| escape sequence in a primitive assignment
%   to the integer \cs{l_@@_internal_a_int}. At the end of the run of
%   digits, check that it ends with a right brace.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_g:w #1#2
  {
    \token_if_eq_meaning:NNTF #1 \@@_replacement_lbrace:N
      { \l_@@_internal_a_int = \@@_replacement_g_digits:NN }
      { \@@_replacement_error:NNN g #1 #2 }
  }
\cs_new:Npn \@@_replacement_g_digits:NN #1#2
  {
    \token_if_eq_meaning:NNTF #1 \@@_replacement_normal:n
      {
        \if_int_compare:w \c_one_int < 1#2 \exp_stop_f:
          #2
          \exp_after:wN \use_i:nnn
          \exp_after:wN \@@_replacement_g_digits:NN
        \else:
          \exp_stop_f:
          \exp_after:wN \@@_replacement_error:NNN
          \exp_after:wN g
        \fi:
      }
      {
        \exp_stop_f:
        \if_meaning:w \@@_replacement_rbrace:N #1
          \exp_args:No \@@_replacement_put_submatch:n
            { \int_use:N \l_@@_internal_a_int }
          \exp_after:wN \use_none:nn
        \else:
          \exp_after:wN \@@_replacement_error:NNN
          \exp_after:wN g
        \fi:
      }
    #1 #2
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Csnames in replacement}
%
% \begin{macro}{\@@_replacement_c:w}
%   |\c| may only be followed by an unescaped character. If followed by
%   a left brace, start a control sequence by calling an auxiliary
%   common with |\u|. Otherwise test whether the category is known; if
%   it is not, complain.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_c:w #1#2
  {
    \token_if_eq_meaning:NNTF #1 \@@_replacement_normal:n
      {
        \cs_if_exist:cTF { @@_replacement_c_#2:w }
          { \@@_replacement_cat:NNN #2 }
          { \@@_replacement_error:NNN c #1#2 }
      }
      {
        \token_if_eq_meaning:NNTF #1 \@@_replacement_lbrace:N
          { \@@_replacement_cu_aux:Nw \@@_replacement_exp_not:N }
          { \@@_replacement_error:NNN c #1#2 }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_cu_aux:Nw}
%   Start a control sequence with \cs{cs:w}, protected
%   from expansion by |#1| (either \cs{@@_replacement_exp_not:N} or
%   \cs{@@_replacement_exp_not:V}, the latter being initially a copy of
%   \cs{exp_not:V}), or turned to a string by \cs{tl_to_str:V} if inside
%   another csname construction |\c| or |\u|. We use \cs{tl_to_str:V}
%   rather than \cs{tl_to_str:N} to deal with integers and other
%   registers.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_cu_aux:Nw #1
  {
    \if_case:w \l_@@_replacement_csnames_int
      \tl_build_put_right:Nn \l_@@_build_tl
        { \exp_not:n { \exp_after:wN #1 \cs:w } }
    \else:
      \tl_build_put_right:Nn \l_@@_build_tl
        { \exp_not:n { \exp_after:wN \tl_to_str:V \cs:w } }
    \fi:
    \int_incr:N \l_@@_replacement_csnames_int
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_u:w}
%   Check that |\u| is followed by a left brace. If so, start a control
%   sequence with \cs{cs:w}, which is then unpacked either with
%   \cs{@@_replacement_exp_not:V} or \cs{tl_to_str:V} depending on the
%   current context.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_u:w #1#2
  {
    \token_if_eq_meaning:NNTF #1 \@@_replacement_lbrace:N
      { \@@_replacement_cu_aux:Nw \@@_replacement_exp_not:V }
      { \@@_replacement_error:NNN u #1#2 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_rbrace:N}
%   Within a |\c{...}| or |\u{...}| construction, end the control
%   sequence, and decrease the brace count. Otherwise, this is a raw
%   right brace.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_rbrace:N #1
  {
    \if_int_compare:w \l_@@_replacement_csnames_int > \c_zero_int
      \tl_build_put_right:Nn \l_@@_build_tl { \cs_end: }
      \int_decr:N \l_@@_replacement_csnames_int
    \else:
      \@@_replacement_normal:n {#1}
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_lbrace:N}
%   Within a |\c{...}| or |\u{...}| construction, this is
%   forbidden. Otherwise, this is a raw left brace.
%   Note for review: the error message is always given the argument
%   |u|, including when the enclosing construction is |\c{...}|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_lbrace:N #1
  {
    \if_int_compare:w \l_@@_replacement_csnames_int > \c_zero_int
      \msg_error:nnn { regex } { cu-lbrace } { u }
    \else:
      \@@_replacement_normal:n {#1}
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Characters in replacement}
%
% \begin{macro}{\@@_replacement_cat:NNN}
%   Here, |#1| is a letter among |BEMTPUDSLOA| and |#2#3| denote the
%   next character.
%   Complain if we reach the end of the replacement or
%   if the construction appears inside |\c{|\ldots{}|}| or
%   |\u{|\ldots{}|}|, and detect the case of a parenthesis. In that
%   case, store the current category in a sequence and switch to a new
%   one. Also complain if the next character is an escaped alphanumeric
%   character; the category-specific auxiliary is called in that case
%   too, after the error.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_cat:NNN #1#2#3
  {
    \token_if_eq_meaning:NNTF \prg_do_nothing: #3
      { \msg_error:nn { regex } { replacement-catcode-end } }
      {
        \int_compare:nNnTF \l_@@_replacement_csnames_int > \c_zero_int
          {
            \msg_error:nnnn
              { regex } { replacement-catcode-in-cs } {#1} {#3}
            #2 #3
          }
          {
            \@@_two_if_eq:NNNNTF #2 #3 \@@_replacement_normal:n (
              {
                \seq_push:NV \l_@@_replacement_category_seq
                  \l_@@_replacement_category_tl
                \tl_set:Nn \l_@@_replacement_category_tl {#1}
              }
              {
                \token_if_eq_meaning:NNT #2 \@@_replacement_escaped:N
                  {
                    \@@_char_if_alphanumeric:NTF #3
                      {
                        \msg_error:nnnn
                          { regex } { replacement-catcode-escaped }
                          {#1} {#3}
                      }
                      { }
                  }
                \use:c { @@_replacement_c_#1:w } #2 #3
              }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% We now need to change the category code of the null character many
% times, hence work in a group. The catcode-specific macros below are
% defined in alphabetical order; if you are trying to understand the
% code, start from the end of the alphabet as those categories are
% simpler than active or begin-group.
%    \begin{macrocode}
\group_begin:
%    \end{macrocode}
%
% \begin{macro}{\@@_replacement_char:nNN}
%   The only way to produce an arbitrary character--catcode pair is to
%   use the \tn{lowercase} or \tn{uppercase} primitives. This is a
%   wrapper for our purposes. The first argument is the null character
%   with various catcodes. The second and third arguments are grabbed
%   from the input stream: |#3| is the character whose character code to
%   reproduce. We could use \cs{char_generate:nn} but only for some
%   catcodes (active characters and spaces are not supported).
%   The second argument |#2| is not used here.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_char:nNN #1#2#3
  {
    \tex_lccode:D \c_zero_int = `#3 \scan_stop:
    \tex_lowercase:D { \@@_replacement_put:n {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_A:w}
%   For an active character, expansion must be avoided, twice because we
%   later do two \texttt{e}-expansions, to unpack \tn{toks} for the
%   query, and to expand their contents to tokens of the query.
%    \begin{macrocode}
\char_set_catcode_active:N \^^@
\cs_new_protected:Npn \@@_replacement_c_A:w
  { \@@_replacement_char:nNN { \exp_not:n { \exp_not:N ^^@ } } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_B:w}
%   An explicit begin-group token increases the balance, unless within a
%   |\c{...}| or |\u{...}| construction. Add the desired begin-group
%   character, using the standard \cs{if_false:} trick. We eventually
%   \texttt{e}-expand twice. The first time must yield a balanced token
%   list, and the second one gives the bare begin-group token. The
%   \cs{exp_after:wN} is not strictly needed, but is more consistent
%   with \pkg{l3tl-analysis}.
%    \begin{macrocode}
\char_set_catcode_group_begin:N \^^@
\cs_new_protected:Npn \@@_replacement_c_B:w
  {
    \if_int_compare:w \l_@@_replacement_csnames_int = \c_zero_int
      \int_incr:N \l_@@_balance_int
    \fi:
    \@@_replacement_char:nNN
      { \exp_not:n { \exp_after:wN ^^@ \if_false: } \fi: } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_C:w}
%   This is not quite catcode-related: when the user requests a
%   character with category \enquote{control sequence}, the
%   one-character control symbol is returned. As for the active
%   character, we prepare for two \texttt{e}-expansions.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_c_C:w #1#2
  {
    \tl_build_put_right:Nn \l_@@_build_tl
      { \exp_not:N \@@_replacement_exp_not:N \exp_not:c {#2} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_D:w}
%   Subscripts fit the mould: \tn{lowercase} the null byte with the
%   correct category.
%    \begin{macrocode}
\char_set_catcode_math_subscript:N \^^@
\cs_new_protected:Npn \@@_replacement_c_D:w
  { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_E:w}
%   Similar to the begin-group case, the second \texttt{e}-expansion
%   produces the bare end-group token. The balance is decreased unless
%   within a |\c{...}| or |\u{...}| construction.
%    \begin{macrocode}
\char_set_catcode_group_end:N \^^@
\cs_new_protected:Npn \@@_replacement_c_E:w
  {
    \if_int_compare:w \l_@@_replacement_csnames_int = \c_zero_int
      \int_decr:N \l_@@_balance_int
    \fi:
    \@@_replacement_char:nNN
      { \exp_not:n { \if_false: { \fi: ^^@ } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_L:w}
%   Simply \tn{lowercase} a letter null byte to produce an arbitrary letter.
%    \begin{macrocode}
\char_set_catcode_letter:N \^^@
\cs_new_protected:Npn \@@_replacement_c_L:w
  { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_M:w}
%   No surprise here, we lowercase the null math toggle.
%    \begin{macrocode}
\char_set_catcode_math_toggle:N \^^@
\cs_new_protected:Npn \@@_replacement_c_M:w
  { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_O:w}
%   Lowercase an other null byte.
%    \begin{macrocode}
\char_set_catcode_other:N \^^@
\cs_new_protected:Npn \@@_replacement_c_O:w
  { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_P:w}
%   For macro parameters, expansion is a tricky issue. We need to
%   prepare for two \texttt{e}-expansions and passing through various
%   macro definitions.
%   Note that we cannot replace one \cs{exp_not:n} by
%   doubling the macro parameter characters because this would misbehave
%   if a mischievous user asks for |\c{\cP\#}|, since that macro
%   parameter character would be doubled.
%    \begin{macrocode}
\char_set_catcode_parameter:N \^^@
\cs_new_protected:Npn \@@_replacement_c_P:w
  {
    \@@_replacement_char:nNN
      { \exp_not:n { \exp_not:n { ^^@^^@^^@^^@ } } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_S:w}
%   Spaces are normalized on input by \TeX{} to have character code
%   $32$. It is in fact impossible to get a token with character code
%   $0$ and category code $10$. Hence we use $32$ instead of $0$ as our
%   base character. Asking for a space with character code $0$ raises
%   an error.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_c_S:w #1#2
  {
    \if_int_compare:w `#2 = \c_zero_int
      \msg_error:nn { regex } { replacement-null-space }
    \fi:
    \tex_lccode:D `\ = `#2 \scan_stop:
    \tex_lowercase:D { \@@_replacement_put:n {~} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_T:w}
%   No surprise for alignment tabs here. Those are surrounded by the
%   appropriate braces whenever necessary, hence they don't cause
%   trouble in alignment settings.
%    \begin{macrocode}
\char_set_catcode_alignment:N \^^@
\cs_new_protected:Npn \@@_replacement_c_T:w
  { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_U:w}
%   Simple call to \cs{@@_replacement_char:nNN} which lowercases the
%   math superscript |^^@|.
%    \begin{macrocode}
\char_set_catcode_math_superscript:N \^^@
\cs_new_protected:Npn \@@_replacement_c_U:w
  { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% Restore the catcode of the null byte.
%    \begin{macrocode}
\group_end:
%    \end{macrocode}
%
% \subsubsection{An error}
%
% \begin{macro}{\@@_replacement_error:NNN}
%   Simple error reporting by calling one of the messages
%   \texttt{replacement-c}, \texttt{replacement-g}, or
%   \texttt{replacement-u}.
%   After the error, the two tokens |#2| and |#3| are put back in the
%   input stream.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_error:NNN #1#2#3
  {
    \msg_error:nne { regex } { replacement-#1 } {#3}
    #2 #3
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{User functions}
%
% \begin{macro}{\regex_new:N}
%   Before being assigned a sensible value, a regex variable matches
%   nothing.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_new:N #1
  { \cs_new_eq:NN #1 \c_@@_no_match_regex }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\l_tmpa_regex, \l_tmpb_regex, \g_tmpa_regex, \g_tmpb_regex}
%   The usual scratch space.
%    \begin{macrocode}
\regex_new:N \l_tmpa_regex
\regex_new:N \l_tmpb_regex
\regex_new:N \g_tmpa_regex
\regex_new:N \g_tmpb_regex
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\regex_set:Nn, \regex_gset:Nn, \regex_const:Nn}
%   Compile, then store the result in the user variable with the
%   appropriate assignment function.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_set:Nn #1#2
  {
    \@@_compile:n {#2}
    \tl_set_eq:NN #1 \l_@@_internal_regex
  }
\cs_new_protected:Npn \regex_gset:Nn #1#2
  {
    \@@_compile:n {#2}
    \tl_gset_eq:NN #1 \l_@@_internal_regex
  }
\cs_new_protected:Npn \regex_const:Nn #1#2
  {
    \@@_compile:n {#2}
    \tl_const:Ne #1 { \exp_not:o \l_@@_internal_regex }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \regex_show:n, \regex_log:n, \@@_show:Nn,
%     \regex_show:N, \regex_log:N, \@@_show:NN
%   }
%   User functions: the \texttt{n} variant requires compilation first.
%   Then show the variable with some appropriate text. The auxiliary
%   \cs{@@_show:N} is defined in a different section.
%   The first argument of \cs{@@_show:Nn} and \cs{@@_show:NN} is the
%   message function to use, which is how the \texttt{show} and
%   \texttt{log} variants differ.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_show:n { \@@_show:Nn \msg_show:nneeee }
\cs_new_protected:Npn \regex_log:n { \@@_show:Nn \msg_log:nneeee }
\cs_new_protected:Npn \@@_show:Nn #1#2
  {
    \@@_compile:n {#2}
    \@@_show:N \l_@@_internal_regex
    #1 { regex } { show }
      { \tl_to_str:n {#2} } { }
      { \l_@@_internal_a_tl } { }
  }
\cs_new_protected:Npn \regex_show:N { \@@_show:NN \msg_show:nneeee }
\cs_new_protected:Npn \regex_log:N { \@@_show:NN \msg_log:nneeee }
\cs_new_protected:Npn \@@_show:NN #1#2
  {
    \__kernel_chk_tl_type:NnnT #2 { regex }
      { \exp_args:No \@@_clean_regex:n {#2} }
      {
        \@@_show:N #2
        #1 { regex } { show }
          { } { \token_to_str:N #2 }
          { \l_@@_internal_a_tl } { }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[TF]
%   {
%     \regex_match:nn, \regex_match:nV,
%     \regex_match:Nn, \regex_match:NV
%   }
%   Those conditionals are based on a common auxiliary defined
%   later. Its first argument builds the \textsc{nfa} corresponding to
%   the regex, and the second argument is the query token list. Once we
%   have performed the match, convert the resulting boolean to
%   \cs{prg_return_true:} or \texttt{false}.
%    \begin{macrocode}
\prg_new_protected_conditional:Npnn \regex_match:nn #1#2 { T , F , TF }
  {
    \@@_if_match:nn { \@@_build:n {#1} } {#2}
    \@@_return:
  }
\prg_generate_conditional_variant:Nnn \regex_match:nn { nV } { T , F , TF }
\prg_new_protected_conditional:Npnn \regex_match:Nn #1#2 { T , F , TF }
  {
    \@@_if_match:nn { \@@_build:N #1 } {#2}
    \@@_return:
  }
\prg_generate_conditional_variant:Nnn \regex_match:Nn { NV } { T , F , TF }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\regex_count:nnN, \regex_count:nVN, \regex_count:NnN, \regex_count:NVN}
%   Again, use an auxiliary whose first argument builds the \textsc{nfa}.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_count:nnN #1
  { \@@_count:nnN { \@@_build:n {#1} } }
\cs_new_protected:Npn \regex_count:NnN #1
  { \@@_count:nnN { \@@_build:N #1 } }
\cs_generate_variant:Nn \regex_count:nnN { nV }
\cs_generate_variant:Nn \regex_count:NnN { NV }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[noTF]{\regex_match_case:nn}
%   The auxiliary errors if |#1| has an odd number of items, and
%   otherwise it sets \cs{g_@@_case_int} according to which case was
%   found (zero if not found). The \texttt{true} branch leaves the
%   corresponding code in the input stream, namely the item of |#1|
%   at position $2$ times \cs{g_@@_case_int}, followed by the
%   user's \texttt{true} code.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_match_case:nnTF #1#2#3
  {
    \@@_match_case:nnTF {#1} {#2}
      {
        \tl_item:nn {#1} { 2 * \g_@@_case_int }
        #3
      }
  }
\cs_new_protected:Npn \regex_match_case:nn #1#2
  { \regex_match_case:nnTF {#1} {#2} { } { } }
\cs_new_protected:Npn \regex_match_case:nnT #1#2#3
  { \regex_match_case:nnTF {#1} {#2} {#3} { } }
\cs_new_protected:Npn \regex_match_case:nnF #1#2
  { \regex_match_case:nnTF {#1} {#2} { } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[noTF]
%   {
%     \regex_extract_once:nnN, \regex_extract_once:nVN,
%     \regex_extract_once:NnN, \regex_extract_once:NVN,
%     \regex_extract_all:nnN, \regex_extract_all:nVN,
%     \regex_extract_all:NnN, \regex_extract_all:NVN,
%     \regex_replace_once:nnN, \regex_replace_once:nVN,
%     \regex_replace_once:NnN, \regex_replace_once:NVN,
%     \regex_replace_all:nnN, \regex_replace_all:nVN,
%     \regex_replace_all:NnN, \regex_replace_all:NVN,
%     \regex_split:NnN, \regex_split:NVN,
%     \regex_split:nnN, \regex_split:nVN
%   }
%   We define here $40$ user functions, following a common pattern in
%   terms of \texttt{:nnN} auxiliaries, defined in the coming
%   subsections.
%   The auxiliary is handed \cs{@@_build:n} or
%   \cs{@@_build:N} with the appropriate regex argument, then all
%   other necessary arguments (replacement text, token list, \emph{etc.})
%   The conditionals call \cs{@@_return:} to return either
%   \texttt{true} or \texttt{false} once matching has been performed.
%   The \texttt{V}-type variants of all these functions are generated
%   in addition to the $40$ base functions.
%    \begin{macrocode}
\cs_set_protected:Npn \@@_tmp:w #1#2#3
  {
    \cs_new_protected:Npn #2 ##1 { #1 { \@@_build:n {##1} } }
    \cs_new_protected:Npn #3 ##1 { #1 { \@@_build:N ##1 } }
    \prg_new_protected_conditional:Npnn #2 ##1##2##3 { T , F , TF }
      { #1 { \@@_build:n {##1} } {##2} ##3 \@@_return: }
    \prg_new_protected_conditional:Npnn #3 ##1##2##3 { T , F , TF }
      { #1 { \@@_build:N ##1 } {##2} ##3 \@@_return: }
    \cs_generate_variant:Nn #2 { nV }
    \prg_generate_conditional_variant:Nnn #2 { nV } { T , F , TF }
    \cs_generate_variant:Nn #3 { NV }
    \prg_generate_conditional_variant:Nnn #3 { NV } { T , F , TF }
  }
\@@_tmp:w \@@_extract_once:nnN
  \regex_extract_once:nnN \regex_extract_once:NnN
\@@_tmp:w \@@_extract_all:nnN
  \regex_extract_all:nnN \regex_extract_all:NnN
\@@_tmp:w \@@_replace_once:nnN
  \regex_replace_once:nnN \regex_replace_once:NnN
\@@_tmp:w \@@_replace_all:nnN
  \regex_replace_all:nnN \regex_replace_all:NnN
\@@_tmp:w \@@_split:nnN \regex_split:nnN \regex_split:NnN
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[noTF]{\regex_replace_case_once:nN}
%   If the input is bad (odd number of items) then take the false
%   branch. Otherwise, use the same auxiliary as
%   \cs{regex_replace_once:nnN}, but with more complicated code to build
%   the automaton, and to find what replacement text to use. The
%   \cs{tl_item:nn} is only expanded once we know the value of
%   \cs{g_@@_case_int}, namely which case matched.
%   In both branches the last token left in the input stream
%   (\cs{use_ii:nn} or \cs{bool_if:NTF} with its boolean) picks up the
%   \texttt{true} and \texttt{false} code that follow the two arguments.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_replace_case_once:nNTF #1#2
  {
    \int_if_odd:nTF { \tl_count:n {#1} }
      {
        \msg_error:nneeee { regex } { case-odd }
          { \token_to_str:N \regex_replace_case_once:nN(TF) } { code }
          { \tl_count:n {#1} } { \tl_to_str:n {#1} }
        \use_ii:nn
      }
      {
        \@@_replace_once_aux:nnN
          { \@@_case_build:e { \@@_tl_odd_items:n {#1} } }
          { \@@_replacement:e { \tl_item:nn {#1} { 2 * \g_@@_case_int } } }
          #2
        \bool_if:NTF \g_@@_success_bool
      }
  }
\cs_new_protected:Npn \regex_replace_case_once:nN #1#2
  { \regex_replace_case_once:nNTF {#1} {#2} { } { } }
\cs_new_protected:Npn \regex_replace_case_once:nNT #1#2#3
  { \regex_replace_case_once:nNTF {#1} {#2} {#3} { } }
\cs_new_protected:Npn \regex_replace_case_once:nNF #1#2
  { \regex_replace_case_once:nNTF {#1} {#2} { } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[noTF]{\regex_replace_case_all:nN}
%   If the input is bad (odd number of items) then take the false
%   branch. Otherwise, use the same auxiliary as
%   \cs{regex_replace_all:nnN}, but with more complicated code to build
%   the automaton, and to find what replacement text to use.
%   Here all the replacement texts (the even items of |#1|) are handed
%   to \cs{@@_case_replacement:e} at once.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_replace_case_all:nNTF #1#2
  {
    \int_if_odd:nTF { \tl_count:n {#1} }
      {
        \msg_error:nneeee { regex } { case-odd }
          { \token_to_str:N \regex_replace_case_all:nN(TF) } { code }
          { \tl_count:n {#1} } { \tl_to_str:n {#1} }
        \use_ii:nn
      }
      {
        \@@_replace_all_aux:nnN
          { \@@_case_build:e { \@@_tl_odd_items:n {#1} } }
          { \@@_case_replacement:e { \@@_tl_even_items:n {#1} } }
          #2
        \bool_if:NTF \g_@@_success_bool
      }
  }
\cs_new_protected:Npn \regex_replace_case_all:nN #1#2
  { \regex_replace_case_all:nNTF {#1} {#2} { } { } }
\cs_new_protected:Npn \regex_replace_case_all:nNT #1#2#3
  { \regex_replace_case_all:nNTF {#1} {#2} {#3} { } }
\cs_new_protected:Npn \regex_replace_case_all:nNF #1#2
  { \regex_replace_case_all:nNTF {#1} {#2} { } }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Variables and helpers for user functions}
%
% \begin{variable}{\l_@@_match_count_int}
%   The number of matches found so far is stored
%   in \cs{l_@@_match_count_int}. This is only used
%   in the \cs{regex_count:nnN} functions.
%    \begin{macrocode}
\int_new:N \l_@@_match_count_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_begin_flag, \l_@@_end_flag}
%   Those flags are raised to indicate begin-group or end-group tokens
%   that had to be added when extracting submatches.
%    \begin{macrocode}
\flag_new:N \l_@@_begin_flag
\flag_new:N \l_@@_end_flag
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_min_submatch_int, \l_@@_submatch_int, \l_@@_zeroth_submatch_int}
%   The end-points of each submatch are stored in two arrays whose index \meta{submatch} ranges
%   from \cs{l_@@_min_submatch_int} (inclusive) to
%   \cs{l_@@_submatch_int} (exclusive). Each successful match comes
%   with a $0$-th submatch (the full match), and one match for each
%   capturing group: submatches corresponding to the last successful
%   match are labelled starting at \texttt{zeroth_submatch}.
The entry % \cs{l_@@_zeroth_submatch_int} in \cs{g_@@_submatch_prev_intarray} holds % the position at which that match attempt started: this is used for % splitting and replacements. % \begin{macrocode} \int_new:N \l_@@_min_submatch_int \int_new:N \l_@@_submatch_int \int_new:N \l_@@_zeroth_submatch_int % \end{macrocode} % \end{variable} % % \begin{variable}{\g_@@_submatch_prev_intarray, \g_@@_submatch_begin_intarray, \g_@@_submatch_end_intarray, \g_@@_submatch_case_intarray} % Hold the place where the match attempt begun, the end-points of each % submatch, and which regex case the match corresponds to, respectively. % \begin{macrocode} \intarray_new:Nn \g_@@_submatch_prev_intarray { 65536 } \intarray_new:Nn \g_@@_submatch_begin_intarray { 65536 } \intarray_new:Nn \g_@@_submatch_end_intarray { 65536 } \intarray_new:Nn \g_@@_submatch_case_intarray { 65536 } % \end{macrocode} % \end{variable} % % \begin{variable}{\g_@@_balance_intarray} % The first thing we do when matching is to store the balance of % begin-group/end-group characters into \cs{g_@@_balance_intarray}. % \begin{macrocode} \intarray_new:Nn \g_@@_balance_intarray { 65536 } % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_added_begin_int, \l_@@_added_end_int} % Keep track of the number of left/right braces to add when performing % a regex operation such as a replacement. % \begin{macrocode} \int_new:N \l_@@_added_begin_int \int_new:N \l_@@_added_end_int % \end{macrocode} % \end{variable} % % \begin{macro}{\@@_return:} % This function triggers either \cs{prg_return_false:} or % \cs{prg_return_true:} as appropriate to whether a match was found or % not. It is used by all user conditionals. 
%    \begin{macrocode}
\cs_new_protected:Npn \@@_return:
  {
    \if_meaning:w \c_true_bool \g_@@_success_bool
      \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_query_set:n, \@@_query_set_aux:nN}
%   To easily extract subsets of the input once we have found the positions
%   at which to cut, store the input tokens one by one into successive
%   \tn{toks} registers.  Also store the brace balance (used to check
%   for overall brace balance) in an array.  The balance recorded at a
%   given position is the one reached before the token at that position
%   is taken into account.  An empty entry is stored before the first
%   token and after the last one; for these the auxiliary receives~|F|,
%   for which neither the incrementing (value~$1$) nor the decrementing
%   (value~$2$) branch of the \cs{if_case:w} applies.  At the end
%   \cs{l_@@_max_pos_int} is set to the last position used.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_query_set:n #1
  {
    \int_zero:N \l_@@_balance_int
    \int_zero:N \l_@@_curr_pos_int
    \@@_query_set_aux:nN { } F
    \tl_analysis_map_inline:nn {#1}
      { \@@_query_set_aux:nN {##1} ##3 }
    \@@_query_set_aux:nN { } F
    \int_set_eq:NN \l_@@_max_pos_int \l_@@_curr_pos_int
  }
\cs_new_protected:Npn \@@_query_set_aux:nN #1#2
  {
    \int_incr:N \l_@@_curr_pos_int
    \@@_toks_set:Nn \l_@@_curr_pos_int {#1}
    \__kernel_intarray_gset:Nnn \g_@@_balance_intarray
      \l_@@_curr_pos_int \l_@@_balance_int
    \if_case:w "#2 \exp_stop_f:
    \or: \int_incr:N \l_@@_balance_int
    \or: \int_decr:N \l_@@_balance_int
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Matching}
%
% \begin{macro}{\@@_if_match:nn}
%   We don't track submatches, and stop after a single match.  Build the
%   \textsc{nfa} with |#1|, and perform the match on the query |#2|.
%   Everything happens inside a group.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_if_match:nn #1#2
  {
    \group_begin:
      \@@_disable_submatches:
      \@@_single_match:
      #1
      \@@_match:n {#2}
    \group_end:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_match_case:nnTF}
% \begin{macro}[EXP]{\@@_match_case_aux:nn}
%   The code would get badly messed up if the number of items in |#1|
%   were not even, so we catch this case, then follow the same code as
%   \cs{regex_match:nnTF} but using \cs{@@_case_build:n} and without
%   returning a result.  The auxiliary \cs{@@_match_case_aux:nn} leaves
%   its first argument, braced and protected from further expansion, and
%   drops its second argument.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_match_case:nnTF #1#2
  {
    \int_if_odd:nTF { \tl_count:n {#1} }
      {
        \msg_error:nneeee { regex } { case-odd }
          { \token_to_str:N \regex_match_case:nn(TF) } { code }
          { \tl_count:n {#1} } { \tl_to_str:n {#1} }
        \use_ii:nn
      }
      {
        \@@_if_match:nn
          { \@@_case_build:e { \@@_tl_odd_items:n {#1} } }
          {#2}
        \bool_if:NTF \g_@@_success_bool
      }
  }
\cs_new:Npn \@@_match_case_aux:nn #1#2 { \exp_not:n { {#1} } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_count:nnN}
%   Again, we don't care about submatches.  Instead of aborting after the
%   first \enquote{longest match} is found, we search for multiple
%   matches, incrementing \cs{l_@@_match_count_int} every time to
%   record the number of matches.  Build the \textsc{nfa} and match.  At
%   the end, store the result in the user's variable.  The count is
%   expanded before the group ends, so that the assignment to~|#3|
%   takes place outside the group.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_count:nnN #1#2#3
  {
    \group_begin:
      \@@_disable_submatches:
      \int_zero:N \l_@@_match_count_int
      \@@_multi_match:n { \int_incr:N \l_@@_match_count_int }
      #1
      \@@_match:n {#2}
      \exp_args:NNNo
    \group_end:
    \int_set:Nn #3 { \int_use:N \l_@@_match_count_int }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Extracting submatches}
%
% \begin{macro}{\@@_extract_once:nnN, \@@_extract_all:nnN}
%   Match once or multiple times.  After each match (or after the only
%   match), extract the submatches using \cs{@@_extract:}.  At the
%   end, store the sequence containing all the submatches into the user
%   variable |#3| after closing the group.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_extract_once:nnN #1#2#3
  {
    \group_begin:
      \@@_single_match:
      #1
      \@@_match:n {#2}
      \@@_extract:
      \@@_query_set:n {#2}
    \@@_group_end_extract_seq:N #3
  }
\cs_new_protected:Npn \@@_extract_all:nnN #1#2#3
  {
    \group_begin:
      \@@_multi_match:n { \@@_extract: }
      #1
      \@@_match:n {#2}
      \@@_query_set:n {#2}
    \@@_group_end_extract_seq:N #3
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_split:nnN}
%   Splitting at submatches is a bit more tricky.  For each match,
%   extract all submatches, and replace the zeroth submatch by the part
%   of the query between the start of the match attempt and the start of
%   the zeroth submatch.  This is inhibited if the delimiter matched an
%   empty token list at the start of this match attempt.  After the last
%   match, store the last part of the token list, which ranges from the
%   start of the match attempt to the end of the query.  This step is
%   inhibited if the last match was empty and at the very end: decrement
%   \cs{l_@@_submatch_int}, which controls which matches will be used.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_split:nnN #1#2#3
  {
    \group_begin:
      \@@_multi_match:n
        {
          \if_int_compare:w
            \l_@@_start_pos_int < \l_@@_success_pos_int
            \@@_extract:
            \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
              \l_@@_zeroth_submatch_int \c_zero_int
            \__kernel_intarray_gset:Nnn \g_@@_submatch_end_intarray
              \l_@@_zeroth_submatch_int
              {
                \__kernel_intarray_item:Nn \g_@@_submatch_begin_intarray
                  \l_@@_zeroth_submatch_int
              }
            \__kernel_intarray_gset:Nnn \g_@@_submatch_begin_intarray
              \l_@@_zeroth_submatch_int
              \l_@@_start_pos_int
          \fi:
        }
      #1
      \@@_match:n {#2}
      \@@_query_set:n {#2}
      \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
        \l_@@_submatch_int \c_zero_int
      \__kernel_intarray_gset:Nnn \g_@@_submatch_end_intarray
        \l_@@_submatch_int
        \l_@@_max_pos_int
      \__kernel_intarray_gset:Nnn \g_@@_submatch_begin_intarray
        \l_@@_submatch_int
        \l_@@_start_pos_int
      \int_incr:N \l_@@_submatch_int
      \if_meaning:w \c_true_bool \l_@@_empty_success_bool
        \if_int_compare:w \l_@@_start_pos_int = \l_@@_max_pos_int
          \int_decr:N \l_@@_submatch_int
        \fi:
      \fi:
    \@@_group_end_extract_seq:N #3
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_end_extract_seq:N}
% \begin{macro}{\@@_extract_seq:N}
% \begin{macro}{\@@_extract_seq:NNn}
% \begin{macro}{\@@_extract_seq_loop:Nw}
%   The end-points of submatches are stored as entries of two arrays
%   from \cs{l_@@_min_submatch_int} to \cs{l_@@_submatch_int}
%   (exclusive).  Extract the relevant ranges into \cs{g_@@_internal_tl},
%   separated by \cs{@@_tmp:w} |{}|.  We keep track in the two flags
%   \cs{l_@@_begin_flag} and \cs{l_@@_end_flag} of the number of
%   begin-group or end-group tokens added to make each of these items
%   overall balanced.  At this step, |}{| is counted as being balanced
%   (same number of begin-group and end-group tokens).  This problem is
%   caught by \cs{@@_extract_check:w}, explained later.  After
%   complaining about any begin-group or end-group tokens we had to add,
%   we are ready to construct the user's sequence outside the group.
%   ^^A NOTE(review): \@@_extract_seq:N is the only function here created
%   ^^A with \cs_gset_protected:Npn rather than \cs_new_protected:Npn;
%   ^^A confirm whether this is intentional.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_end_extract_seq:N #1
  {
      \flag_clear:N \l_@@_begin_flag
      \flag_clear:N \l_@@_end_flag
      \cs_set_eq:NN \@@_tmp:w \scan_stop:
      \__kernel_tl_gset:Nx \g_@@_internal_tl
        {
          \int_step_function:nnN \l_@@_min_submatch_int
            { \l_@@_submatch_int - \c_one_int } \@@_extract_seq_aux:n
          \@@_tmp:w
        }
      \int_set:Nn \l_@@_added_begin_int
        { \flag_height:N \l_@@_begin_flag }
      \int_set:Nn \l_@@_added_end_int
        { \flag_height:N \l_@@_end_flag }
      \tex_afterassignment:D \@@_extract_check:w
      \__kernel_tl_gset:Nx \g_@@_internal_tl
        { \g_@@_internal_tl \if_false: { \fi: } }
      \int_compare:nNnT
        { \l_@@_added_begin_int + \l_@@_added_end_int } > \c_zero_int
        {
          \msg_error:nneee { regex } { result-unbalanced }
            { splitting~or~extracting~submatches }
            { \int_use:N \l_@@_added_begin_int }
            { \int_use:N \l_@@_added_end_int }
        }
    \group_end:
    \@@_extract_seq:N #1
  }
\cs_gset_protected:Npn \@@_extract_seq:N #1
  {
    \seq_clear:N #1
    \cs_set_eq:NN \@@_tmp:w \@@_extract_seq_loop:Nw
    \exp_after:wN \@@_extract_seq:NNn
    \exp_after:wN #1
    \g_@@_internal_tl \use_none:nnn
  }
\cs_new_protected:Npn \@@_extract_seq:NNn #1#2#3
  { #3 #2 #1 \prg_do_nothing: }
\cs_new_protected:Npn \@@_extract_seq_loop:Nw #1#2 \@@_tmp:w #3
  {
    \seq_put_right:No #1 {#2}
    #3 \@@_extract_seq_loop:Nw #1 \prg_do_nothing:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_extract_seq_aux:n, \@@_extract_seq_aux:ww}
%   The \texttt{:n} auxiliary builds one item of the sequence of
%   submatches.  First compute the brace balance of the submatch, then
%   extract the submatch from the query, adding the appropriate braces
%   and raising a flag if the submatch is not balanced.  A negative
%   balance leads to begin-group tokens being added before the submatch,
%   a positive balance to end-group tokens being added after it.
%    \begin{macrocode}
\cs_new:Npn \@@_extract_seq_aux:n #1
  {
    \@@_tmp:w { }
    \exp_after:wN \@@_extract_seq_aux:ww
    \int_value:w \@@_submatch_balance:n {#1} ; #1;
  }
\cs_new:Npn \@@_extract_seq_aux:ww #1; #2;
  {
    \if_int_compare:w #1 < \c_zero_int
      \prg_replicate:nn {-#1}
        {
          \flag_raise:N \l_@@_begin_flag
          \exp_not:n { { \if_false: } \fi: }
        }
    \fi:
    \@@_query_submatch:n {#2}
    \if_int_compare:w #1 > \c_zero_int
      \prg_replicate:nn {#1}
        {
          \flag_raise:N \l_@@_end_flag
          \exp_not:n { \if_false: { \fi: } }
        }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_extract_check:w, \@@_extract_check:n,
%     \@@_extract_check_loop:w, \@@_extract_check_end:w
%   }
%   In \cs{@@_group_end_extract_seq:N} we had to expand
%   \cs{g_@@_internal_tl} to turn \cs{if_false:} constructions into
%   actual begin-group and end-group tokens.  This is done with a
%   \cs{__kernel_tl_gset:Nx} assignment, and \cs{@@_extract_check:w} is
%   run immediately after this assignment ends, thanks to the
%   \tn{afterassignment} primitive.  If all of the items were properly
%   balanced (enough begin-group tokens before end-group tokens, so |}{|
%   is not) then \cs{@@_extract_check:w} is called just before the
%   closing brace of the \cs{__kernel_tl_gset:Nx} (thanks to our sneaky
%   \cs{if_false:} |{| \cs{fi:} |}| construction), and finds that there
%   is nothing left to expand.  If any of the items is unbalanced, the
%   assignment gets ended early by an extra end-group token, and our
%   check finds more tokens needing to be expanded in a new
%   \cs{__kernel_tl_gset:Nx} assignment.  We need to add a begin-group
%   and an end-group token to the unbalanced item, namely to the last
%   item found so far, which we reach through a loop.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_extract_check:w
  {
    \exp_after:wN \@@_extract_check:n
    \exp_after:wN { \if_false: } \fi:
  }
\cs_new_protected:Npn \@@_extract_check:n #1
  {
    \tl_if_empty:nF {#1}
      {
        \int_incr:N \l_@@_added_begin_int
        \int_incr:N \l_@@_added_end_int
        \tex_afterassignment:D \@@_extract_check:w
        \__kernel_tl_gset:Nx \g_@@_internal_tl
          {
            \exp_after:wN \@@_extract_check_loop:w
            \g_@@_internal_tl
            \@@_tmp:w \@@_extract_check_end:w
            #1
          }
      }
  }
\cs_new:Npn \@@_extract_check_loop:w #1 \@@_tmp:w #2
  {
    #2
    \exp_not:o {#1}
    \@@_tmp:w { }
    \@@_extract_check_loop:w \prg_do_nothing:
  }
%    \end{macrocode}
%   Arguments of \cs{@@_extract_check_end:w} are: |#1| is the part of
%   the item before the extra end-group token; |#2| is junk; |#3| is
%   \cs{prg_do_nothing:} followed by the not-yet-expanded part of the
%   item after the extra end-group token.  In the replacement text, the
%   first brace and the \cs{if_false:} |{| \cs{fi:} |}| construction are
%   the added begin-group and end-group tokens (the latter being not-yet
%   expanded, just like~|#3|), while the closing brace after
%   \cs{exp_not:o} |{#1}| replaces the extra end-group token that had
%   ended the assignment early.  In particular this means that the
%   character code of that end-group token is lost.
%    \begin{macrocode}
\cs_new:Npn \@@_extract_check_end:w
    \exp_not:o #1#2 \@@_extract_check_loop:w #3 \@@_tmp:w
  {
    { \exp_not:o {#1} }
    #3
    \if_false: { \fi: }
    \@@_tmp:w
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_extract:, \@@_extract_aux:w}
%   Our task here is to extract the list of end-points of submatches, and
%   store them in appropriate array entries, from
%   \cs{l_@@_zeroth_submatch_int} upwards.  First, we store in
%   \cs{g_@@_submatch_prev_intarray} the position at which the match
%   attempt started.  We extract the rest from the comma list
%   \cs{l_@@_success_submatches_tl}, which starts with entries to be
%   stored in \cs{g_@@_submatch_begin_intarray} and continues with
%   entries for \cs{g_@@_submatch_end_intarray}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_extract:
  {
    \if_meaning:w \c_true_bool \g_@@_success_bool
      \int_set_eq:NN \l_@@_zeroth_submatch_int \l_@@_submatch_int
      \prg_replicate:nn \l_@@_capturing_group_int
        {
          \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
            \l_@@_submatch_int \c_zero_int
          \__kernel_intarray_gset:Nnn \g_@@_submatch_case_intarray
            \l_@@_submatch_int \c_zero_int
          \int_incr:N \l_@@_submatch_int
        }
      \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
        \l_@@_zeroth_submatch_int \l_@@_start_pos_int
      \__kernel_intarray_gset:Nnn \g_@@_submatch_case_intarray
        \l_@@_zeroth_submatch_int \g_@@_case_int
      \int_zero:N \l_@@_internal_a_int
      \exp_after:wN \@@_extract_aux:w \l_@@_success_submatches_tl
        \prg_break_point: \@@_use_none_delimit_by_q_recursion_stop:w ,
        \q_@@_recursion_stop
    \fi:
  }
\cs_new_protected:Npn \@@_extract_aux:w #1 ,
  {
    \prg_break: #1 \prg_break_point:
    \if_int_compare:w \l_@@_internal_a_int < \l_@@_capturing_group_int
      \__kernel_intarray_gset:Nnn \g_@@_submatch_begin_intarray
        { \@@_int_eval:w \l_@@_zeroth_submatch_int + \l_@@_internal_a_int }
        {#1}
    \else:
      \__kernel_intarray_gset:Nnn \g_@@_submatch_end_intarray
        {
          \@@_int_eval:w
            \l_@@_zeroth_submatch_int + \l_@@_internal_a_int
            - \l_@@_capturing_group_int
        }
        {#1}
    \fi:
    \int_incr:N \l_@@_internal_a_int
    \@@_extract_aux:w
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Replacement}
%
% \begin{macro}{\@@_replace_once:nnN, \@@_replace_once_aux:nnN}
%   Build the \textsc{nfa} and the replacement functions, then find a
%   single match.  If the match failed, simply exit the
%   group.  Otherwise, we do the replacement.  Extract submatches.  Compute
%   the brace balance corresponding to replacing this match by the
%   replacement (this depends on submatches).  Prepare the replaced token
%   list: the replacement function produces the tokens from the start of
%   the query to the start of the match and the replacement text for
%   this match; we need to add the tokens from the end of the match to
%   the end of the query.  Finally, store the result in the user's
%   variable after closing the group: this step involves an additional
%   \texttt{e}-expansion, and checks that braces are balanced in the
%   final result.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replace_once:nnN #1#2
  { \@@_replace_once_aux:nnN {#1} { \@@_replacement:n {#2} } }
\cs_new_protected:Npn \@@_replace_once_aux:nnN #1#2#3
  {
    \group_begin:
      \@@_single_match:
      #1
      \exp_args:No \@@_match:n {#3}
      \bool_if:NTF \g_@@_success_bool
        {
          \@@_extract:
          \exp_args:No \@@_query_set:n {#3}
          #2
          \int_set:Nn \l_@@_balance_int
            {
              \@@_replacement_balance_one_match:n
                \l_@@_zeroth_submatch_int
            }
          \__kernel_tl_set:Nx \l_@@_internal_a_tl
            {
              \@@_replacement_do_one_match:n
                \l_@@_zeroth_submatch_int
              \@@_query_range:nn
                {
                  \__kernel_intarray_item:Nn \g_@@_submatch_end_intarray
                    \l_@@_zeroth_submatch_int
                }
                \l_@@_max_pos_int
            }
          \@@_group_end_replace:N #3
        }
        { \group_end: }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replace_all:nnN, \@@_replace_all_aux:nnN}
%   Match multiple times, and for every match, extract submatches and
%   additionally store the position at which the match attempt started.
%   The entries from \cs{l_@@_min_submatch_int} to
%   \cs{l_@@_submatch_int} hold information about submatches of every
%   match in order; each match corresponds to
%   \cs{l_@@_capturing_group_int} consecutive entries.
%   Compute the brace balance corresponding to doing all the
%   replacements: this is the sum of brace balances for replacing each
%   match.  Join together the replacement texts for each match (including
%   the part of the query before the match), and the end of the query.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replace_all:nnN #1#2
  { \@@_replace_all_aux:nnN {#1} { \@@_replacement:n {#2} } }
\cs_new_protected:Npn \@@_replace_all_aux:nnN #1#2#3
  {
    \group_begin:
      \@@_multi_match:n { \@@_extract: }
      #1
      \exp_args:No \@@_match:n {#3}
      \exp_args:No \@@_query_set:n {#3}
      #2
      \int_set:Nn \l_@@_balance_int
        {
          \c_zero_int
          \int_step_function:nnnN
            \l_@@_min_submatch_int
            \l_@@_capturing_group_int
            { \l_@@_submatch_int - \c_one_int }
            \@@_replacement_balance_one_match:n
        }
      \__kernel_tl_set:Nx \l_@@_internal_a_tl
        {
          \int_step_function:nnnN
            \l_@@_min_submatch_int
            \l_@@_capturing_group_int
            { \l_@@_submatch_int - \c_one_int }
            \@@_replacement_do_one_match:n
          \@@_query_range:nn
            \l_@@_start_pos_int \l_@@_max_pos_int
        }
    \@@_group_end_replace:N #3
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_group_end_replace:N, \@@_group_end_replace_try:,
%     \@@_group_end_replace_check:w, \@@_group_end_replace_check:n
%   }
%   At this stage \cs{l_@@_internal_a_tl} |e|-expands to the desired
%   result.  Guess from \cs{l_@@_balance_int} the number of braces to
%   add before or after the result then try expanding.  The simplest
%   case is when \cs{l_@@_internal_a_tl} together with the braces we
%   insert via \cs{prg_replicate:nn} give a balanced result, and the
%   assignment ends at the \cs{if_false:} |{| \cs{fi:} |}| construction:
%   then \cs{@@_group_end_replace_check:w} sees that there is no
%   material left and we successfully found the result.  The harder case
%   is that expanding \cs{l_@@_internal_a_tl} may produce extra closing
%   braces and end the assignment early.  Then we grab the remaining code
%   using \cs{@@_group_end_replace_check:w}; importantly, what follows
%   has not yet been expanded so that
%   \cs{@@_group_end_replace_check:n} grabs everything until the last
%   brace in \cs{@@_group_end_replace_try:}, letting us try again with
%   an extra surrounding pair of braces.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_end_replace:N #1
  {
    \int_set:Nn \l_@@_added_begin_int
      { \int_max:nn { - \l_@@_balance_int } \c_zero_int }
    \int_set:Nn \l_@@_added_end_int
      { \int_max:nn \l_@@_balance_int \c_zero_int }
    \@@_group_end_replace_try:
    \int_compare:nNnT { \l_@@_added_begin_int + \l_@@_added_end_int }
      > \c_zero_int
      {
        \msg_error:nneee { regex } { result-unbalanced }
          { replacing } { \int_use:N \l_@@_added_begin_int }
          { \int_use:N \l_@@_added_end_int }
      }
    \group_end:
    \tl_set_eq:NN #1 \g_@@_internal_tl
  }
\cs_new_protected:Npn \@@_group_end_replace_try:
  {
    \tex_afterassignment:D \@@_group_end_replace_check:w
    \__kernel_tl_gset:Nx \g_@@_internal_tl
      {
        \prg_replicate:nn \l_@@_added_begin_int { { \if_false: } \fi: }
        \l_@@_internal_a_tl
        \prg_replicate:nn \l_@@_added_end_int { \if_false: { \fi: } }
        \if_false: { \fi: }
      }
  }
\cs_new_protected:Npn \@@_group_end_replace_check:w
  {
    \exp_after:wN \@@_group_end_replace_check:n
    \exp_after:wN { \if_false: } \fi:
  }
\cs_new_protected:Npn \@@_group_end_replace_check:n #1
  {
    \tl_if_empty:nF {#1}
      {
        \int_incr:N \l_@@_added_begin_int
        \int_incr:N \l_@@_added_end_int
        \@@_group_end_replace_try:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Peeking ahead}
%
% \begin{variable}{\l_@@_peek_true_tl, \l_@@_peek_false_tl}
%   True/false code arguments of \cs{peek_regex:nTF} or similar.
%    \begin{macrocode}
\tl_new:N \l_@@_peek_true_tl
\tl_new:N \l_@@_peek_false_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_replacement_tl}
%   When peeking in \cs{peek_regex_replace_once:nnTF} we need to store
%   the replacement text.
%    \begin{macrocode}
\tl_new:N \l_@@_replacement_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_input_tl}
% \begin{macro}{\@@_input_item:n}
%   Stores each token found as \cs{@@_input_item:n} \Arg{tokens}, where
%   the \meta{tokens} \texttt{o}-expand to the token found, as for
%   \cs{tl_analysis_map_inline:nn}.
%    \begin{macrocode}
\tl_new:N \l_@@_input_tl
\cs_new_eq:NN \@@_input_item:n ?
%    \end{macrocode}
% \end{macro}
% \end{variable}
%
% \begin{macro}[TF]
%   {\peek_regex:n, \peek_regex:N, \peek_regex_remove_once:n, \peek_regex_remove_once:N}
%   The |T| and |F| functions just call the corresponding |TF| function.
%   The four |TF| functions differ along two axes: whether to remove the
%   token or not, distinguished by using \cs{@@_peek_end:} or
%   \cs{@@_peek_remove_end:n} (the latter case needs an argument, as we
%   will see), and whether the regex has to be compiled or is already in
%   an |N|-type variable, distinguished by calling \cs{@@_build_aux:Nn}
%   or \cs{@@_build_aux:NN}.  The first argument of these functions is
%   \cs{c_false_bool} to indicate that there should be no implicit
%   insertion of a wildcard at the start of the pattern: otherwise the
%   code would keep looking further into the input stream until matching
%   the regex.  In the |N|-type variants the regex variable is always
%   passed on unbraced.
%    \begin{macrocode}
\cs_new_protected:Npn \peek_regex:nTF #1
  {
    \@@_peek:nnTF
      { \@@_build_aux:Nn \c_false_bool {#1} }
      { \@@_peek_end: }
  }
\cs_new_protected:Npn \peek_regex:nT #1#2
  { \peek_regex:nTF {#1} {#2} { } }
\cs_new_protected:Npn \peek_regex:nF #1 { \peek_regex:nTF {#1} { } }
\cs_new_protected:Npn \peek_regex:NTF #1
  {
    \@@_peek:nnTF
      { \@@_build_aux:NN \c_false_bool #1 }
      { \@@_peek_end: }
  }
\cs_new_protected:Npn \peek_regex:NT #1#2
  { \peek_regex:NTF #1 {#2} { } }
\cs_new_protected:Npn \peek_regex:NF #1 { \peek_regex:NTF #1 { } }
\cs_new_protected:Npn \peek_regex_remove_once:nTF #1
  {
    \@@_peek:nnTF
      { \@@_build_aux:Nn \c_false_bool {#1} }
      { \@@_peek_remove_end:n {##1} }
  }
\cs_new_protected:Npn \peek_regex_remove_once:nT #1#2
  { \peek_regex_remove_once:nTF {#1} {#2} { } }
\cs_new_protected:Npn \peek_regex_remove_once:nF #1
  { \peek_regex_remove_once:nTF {#1} { } }
\cs_new_protected:Npn \peek_regex_remove_once:NTF #1
  {
    \@@_peek:nnTF
      { \@@_build_aux:NN \c_false_bool #1 }
      { \@@_peek_remove_end:n {##1} }
  }
\cs_new_protected:Npn \peek_regex_remove_once:NT #1#2
  { \peek_regex_remove_once:NTF #1 {#2} { } }
\cs_new_protected:Npn \peek_regex_remove_once:NF #1
  { \peek_regex_remove_once:NTF #1 { } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek:nnTF, \@@_peek_aux:nnTF}
%   Store the user's true/false codes (plus \cs{group_end:}) into two
%   token lists.  Then build the automaton with |#1|, without submatch
%   tracking, and aiming for a single match.  Then start matching by
%   setting up a few variables as for any regex matching such as
%   \cs{regex_match:nnTF}, with the addition of \cs{l_@@_input_tl}
%   that keeps track of the tokens seen, to reinsert them at the
%   end.  Instead of \cs{tl_analysis_map_inline:nn} on the input, we
%   call \cs{peek_analysis_map_inline:n} to go through tokens in the
%   input stream.  Since \cs{@@_match_one_token:nnN} calls
%   \cs{@@_maplike_break:} we need to catch that and break the
%   \cs{peek_analysis_map_inline:n} loop instead.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek:nnTF #1
  {
    \@@_peek_aux:nnTF
      {
        \@@_disable_submatches:
        #1
      }
  }
\cs_new_protected:Npn \@@_peek_aux:nnTF #1#2#3#4
  {
    \group_begin:
      \tl_set:Nn \l_@@_peek_true_tl { \group_end: #3 }
      \tl_set:Nn \l_@@_peek_false_tl { \group_end: #4 }
      \@@_single_match:
      #1
      \@@_match_init:
      \tl_build_begin:N \l_@@_input_tl
      \@@_match_once_init:
      \peek_analysis_map_inline:n
        {
          \tl_build_put_right:Nn \l_@@_input_tl
            { \@@_input_item:n {##1} }
          \@@_match_one_token:nnN {##1} {##2} ##3
          \use_none:nnn
          \prg_break_point:Nn \@@_maplike_break:
            { \peek_analysis_map_break:n {#2} }
        }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_end:, \@@_peek_remove_end:n}
%   Once the regex matches (or permanently fails to match) we call
%   \cs{@@_peek_end:}, or \cs{@@_peek_remove_end:n} with argument the
%   last token seen.  For \cs{peek_regex:nTF} we reinsert tokens seen by
%   calling \cs{@@_peek_reinsert:N} regardless of the result of the
%   match.  For \cs{peek_regex_remove_once:nTF} we reinsert the tokens
%   seen only if the match failed; otherwise we just reinsert the
%   tokens~|#1|, with one expansion.  To be more precise, |#1| consists
%   of tokens that \texttt{o}-expand and \texttt{e}-expand to the last
%   token seen, for example it is \cs{exp_not:N} \meta{cs} for a control
%   sequence.  This means that just doing \cs{exp_after:wN}
%   \cs{l_@@_peek_true_tl} |#1| would be unsafe because the expansion of
%   \meta{cs} would be suppressed.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_end:
  {
    \bool_if:NTF \g_@@_success_bool
      { \@@_peek_reinsert:N \l_@@_peek_true_tl }
      { \@@_peek_reinsert:N \l_@@_peek_false_tl }
  }
\cs_new_protected:Npn \@@_peek_remove_end:n #1
  {
    \bool_if:NTF \g_@@_success_bool
      { \exp_args:NNo \use:nn \l_@@_peek_true_tl {#1} }
      { \@@_peek_reinsert:N \l_@@_peek_false_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_reinsert:N, \@@_reinsert_item:n}
%   Insert the true/false code |#1|, followed by the tokens found, which
%   were stored in \cs{l_@@_input_tl}.  For this, loop through that
%   token list using \cs{@@_reinsert_item:n}, which expands |#1| once to
%   get a single token, and jumps over it to expand what follows, with
%   suitable \cs{exp:w} and \cs{exp_end:}.  We cannot just use
%   \cs{use:e} on the whole token list because the result may be
%   unbalanced, which would stop the primitive prematurely, or let it
%   continue beyond where we would like.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_reinsert:N #1
  {
    \tl_build_end:N \l_@@_input_tl
    \cs_set_eq:NN \@@_input_item:n \@@_reinsert_item:n
    \exp_after:wN #1 \exp:w \l_@@_input_tl \exp_end:
  }
\cs_new_protected:Npn \@@_reinsert_item:n #1
  {
    \exp_after:wN \exp_after:wN
    \exp_after:wN \exp_end:
    \exp_after:wN \exp_after:wN
    #1
    \exp:w
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[noTF]
%   {\peek_regex_replace_once:nn, \peek_regex_replace_once:Nn}
%   Similar to \cs{peek_regex:nTF} above.
%    \begin{macrocode}
\cs_new_protected:Npn \peek_regex_replace_once:nnTF #1
  { \@@_peek_replace:nnTF { \@@_build_aux:Nn \c_false_bool {#1} } }
\cs_new_protected:Npn \peek_regex_replace_once:nnT #1#2#3
  { \peek_regex_replace_once:nnTF {#1} {#2} {#3} { } }
\cs_new_protected:Npn \peek_regex_replace_once:nnF #1#2
  { \peek_regex_replace_once:nnTF {#1} {#2} { } }
\cs_new_protected:Npn \peek_regex_replace_once:nn #1#2
  { \peek_regex_replace_once:nnTF {#1} {#2} { } { } }
\cs_new_protected:Npn \peek_regex_replace_once:NnTF #1
  { \@@_peek_replace:nnTF { \@@_build_aux:NN \c_false_bool #1 } }
\cs_new_protected:Npn \peek_regex_replace_once:NnT #1#2#3
  { \peek_regex_replace_once:NnTF #1 {#2} {#3} { } }
\cs_new_protected:Npn \peek_regex_replace_once:NnF #1#2
  { \peek_regex_replace_once:NnTF #1 {#2} { } }
\cs_new_protected:Npn \peek_regex_replace_once:Nn #1#2
  { \peek_regex_replace_once:NnTF #1 {#2} { } { } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replace:nnTF}
%   Same as \cs{@@_peek:nnTF} (used for \cs{peek_regex:nTF} above), but
%   without disabling submatches, and with a different end.  The
%   replacement text |#2| is stored in \cs{l_@@_replacement_tl}, to be
%   analyzed later.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replace:nnTF #1#2
  {
    \tl_set:Nn \l_@@_replacement_tl {#2}
    \@@_peek_aux:nnTF {#1} { \@@_peek_replace_end: }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replace_end:}
%   If the match failed \cs{@@_peek_reinsert:N} reinserts the tokens
%   found.  Otherwise, finish storing the submatch information using
%   \cs{@@_extract:}, and store the input into \tn{toks}.  Redefine a
%   few auxiliaries to change slightly their expansion behaviour as
%   explained below.  Analyse the replacement text with
%   \cs{@@_replacement:n}, which as usual defines
%   \cs{@@_replacement_do_one_match:n} to insert the tokens from the
%   start of the match attempt to the beginning of the match, followed
%   by the replacement text.  The \cs{use:e} expands for instance the
%   trailing \cs{@@_query_range:nn} down to a sequence of
%   \cs{@@_reinsert_item:n} \Arg{tokens} where \meta{tokens}
%   \texttt{o}-expand to a single token that we want to insert.  After
%   \texttt{e}-expansion, \cs{use:e} does \cs{use:n}, so we have
%   \cs{exp_after:wN} \cs{l_@@_peek_true_tl} \cs{exp:w} \ldots{}
%   \cs{exp_end:}.  This is set up so as to obtain
%   \cs{l_@@_peek_true_tl} followed by the replaced tokens (possibly
%   unbalanced) in the input stream.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replace_end:
  {
    \bool_if:NTF \g_@@_success_bool
      {
        \@@_extract:
        \@@_query_set_from_input_tl:
        \cs_set_eq:NN \@@_replacement_put:n \@@_peek_replacement_put:n
        \cs_set_eq:NN \@@_replacement_put_submatch_aux:n
          \@@_peek_replacement_put_submatch_aux:n
        \cs_set_eq:NN \@@_input_item:n \@@_reinsert_item:n
        \cs_set_eq:NN \@@_replacement_exp_not:N \@@_peek_replacement_token:n
        \cs_set_eq:NN \@@_replacement_exp_not:V \@@_peek_replacement_var:N
        \exp_args:No \@@_replacement:n { \l_@@_replacement_tl }
        \use:e
          {
            \exp_not:n { \exp_after:wN \l_@@_peek_true_tl \exp:w }
            \@@_replacement_do_one_match:n \l_@@_zeroth_submatch_int
            \@@_query_range:nn
              {
                \__kernel_intarray_item:Nn \g_@@_submatch_end_intarray
                  \l_@@_zeroth_submatch_int
              }
              \l_@@_max_pos_int
            \exp_end:
          }
      }
      { \@@_peek_reinsert:N \l_@@_peek_false_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_query_set_from_input_tl:, \@@_query_set_item:n}
%   The input was stored into \cs{l_@@_input_tl} as successive items
%   \cs{@@_input_item:n} \Arg{tokens}.  Store that in successive
%   \tn{toks}.  It's not clear whether the empty entries before and
%   after are both useful.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_query_set_from_input_tl:
  {
    \tl_build_end:N \l_@@_input_tl
    \int_zero:N \l_@@_curr_pos_int
    \cs_set_eq:NN \@@_input_item:n \@@_query_set_item:n
    \@@_query_set_item:n { }
    \l_@@_input_tl
    \@@_query_set_item:n { }
    \int_set_eq:NN \l_@@_max_pos_int \l_@@_curr_pos_int
  }
\cs_new_protected:Npn \@@_query_set_item:n #1
  {
    \int_incr:N \l_@@_curr_pos_int
    \@@_toks_set:Nn \l_@@_curr_pos_int { \@@_input_item:n {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replacement_put:n}
%   While building the replacement function
%   \cs{@@_replacement_do_one_match:n}, we often want to put simple
%   material, given as |#1|, whose \texttt{e}-expansion
%   \texttt{o}-expands to a single token.  Normally we can just add the
%   token to \cs{l_@@_build_tl}, but for
%   \cs{peek_regex_replace_once:nnTF} we eventually want to do some
%   strange expansion that is basically using \cs{exp_after:wN} to jump
%   through numerous tokens (we cannot use \texttt{e}-expansion like for
%   \cs{regex_replace_once:nnNTF} because it is ok for the result to be
%   unbalanced since we insert it in the input stream rather than
%   storing it).  When within a csname we don't do any such shenanigan
%   because \cs{cs:w} \ldots{} \cs{cs_end:} does all the expansion we
%   need.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replacement_put:n #1
  {
    \if_case:w \l_@@_replacement_csnames_int
      \tl_build_put_right:Nn \l_@@_build_tl
        { \exp_not:N \@@_reinsert_item:n {#1} }
    \else:
      \tl_build_put_right:Nn \l_@@_build_tl {#1}
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replacement_token:n}
%   When hit with \cs{exp:w}, \cs{@@_peek_replacement_token:n}
%   \Arg{token} stops \cs{exp_end:} and does \cs{exp_after:wN}
%   \meta{token} \cs{exp:w} to continue expansion after it.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replacement_token:n #1
  { \exp_after:wN \exp_end: \exp_after:wN #1 \exp:w }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replacement_put_submatch_aux:n}
%   While analyzing the replacement we also have to insert submatches
%   found in the query.  Since query items \cs{@@_input_item:n}
%   \Arg{tokens} expand correctly only when surrounded by \cs{exp:w}
%   \ldots{} \cs{exp_end:}, and since these expansion controls are not
%   there within csnames (because \cs{cs:w} \ldots{} \cs{cs_end:} make
%   them unnecessary in most cases), we have to put \cs{exp:w} and
%   \cs{exp_end:} by hand here.  Both branches store a call to
%   \cs{@@_query_submatch:n} whose argument adds |#1| to the
%   parameter |##1| of the function being built; only the second
%   branch wraps that call in \cs{exp:w} \ldots{} \cs{exp_end:}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replacement_put_submatch_aux:n #1
  {
    \if_case:w \l_@@_replacement_csnames_int
      \tl_build_put_right:Nn \l_@@_build_tl
        { \@@_query_submatch:n { \@@_int_eval:w #1 + ##1 \scan_stop: } }
    \else:
      \tl_build_put_right:Nn \l_@@_build_tl
        {
          \exp:w
          \@@_query_submatch:n { \@@_int_eval:w #1 + ##1 \scan_stop: }
          \exp_end:
        }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replacement_var:N}
%   This is used for |\u| outside csnames.  It makes sure to continue
%   expansion with \cs{exp:w} before expanding the variable~|#1| and
%   stopping the \cs{exp:w} that precedes.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replacement_var:N #1
  {
    \exp_after:wN \exp_last_unbraced:NV
    \exp_after:wN \exp_end:
    \exp_after:wN #1
    \exp:w
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Messages}
%
% Messages for the preparsing phase.  The three definitions are
% \texttt{e}-expanded as a whole by \cs{use:e} before the messages are
% created, which is why the message arguments are written |##1| and
% |##2| here rather than |#1| and |#2|.
%    \begin{macrocode}
\use:e
  {
    \msg_new:nnn { regex } { trailing-backslash }
      { Trailing~'\iow_char:N\\'~in~regex~or~replacement. }
    \msg_new:nnn { regex } { x-missing-rbrace }
      {
        Missing~brace~'\iow_char:N\}'~in~regex~
        '...\iow_char:N\\x\iow_char:N\{...##1'.
      }
    \msg_new:nnn { regex } { x-overflow }
      {
        Character~code~##1~too~large~in~
        \iow_char:N\\x\iow_char:N\{##2\iow_char:N\}~regex.
      }
  }
%    \end{macrocode}
%
% Invalid quantifier.
%    \begin{macrocode}
\msg_new:nnnn { regex } { invalid-quantifier }
  { Braced~quantifier~'#1'~may~not~be~followed~by~'#2'. }
  {
    The~character~'#2'~is~invalid~in~the~braced~quantifier~'#1'.~
    The~only~valid~quantifiers~are~'*',~'?',~'+',~'{<int>}',~
    '{<min>,}'~and~'{<min>,<max>}',~optionally~followed~by~'?'.
  }
%    \end{macrocode}
%
% Messages for missing or extra closing brackets and parentheses, with
% some fancy singular/plural handling for the case of parentheses.
%    \begin{macrocode}
\msg_new:nnnn { regex } { missing-rbrack }
  { Missing~right~bracket~inserted~in~regular~expression. }
  {
    LaTeX~was~given~a~regular~expression~where~a~character~class~
    was~started~with~'[',~but~the~matching~']'~is~missing.
  }
\msg_new:nnnn { regex } { missing-rparen }
  {
    Missing~right~
    \int_compare:nTF { #1 = 1 } { parenthesis } { parentheses } ~
    inserted~in~regular~expression.
  }
  {
    LaTeX~was~given~a~regular~expression~with~\int_eval:n {#1} ~
    more~left~parentheses~than~right~parentheses.
  }
\msg_new:nnnn { regex } { extra-rparen }
  { Extra~right~parenthesis~ignored~in~regular~expression. }
  {
    LaTeX~came~across~a~closing~parenthesis~when~no~submatch~group~
    was~open.~The~parenthesis~will~be~ignored.
  }
%    \end{macrocode}
%
% Some escaped alphanumerics are not allowed everywhere.
%    \begin{macrocode}
\msg_new:nnnn { regex } { bad-escape }
  {
    Invalid~escape~'\iow_char:N\\#1'~
    \@@_if_in_cs:TF { within~a~control~sequence. }
      {
        \@@_if_in_class:TF
          { in~a~character~class. }
          { following~a~category~test. }
      }
  }
  {
    The~escape~sequence~'\iow_char:N\\#1'~may~not~appear~
    \@@_if_in_cs:TF
      {
        within~a~control~sequence~test~introduced~by~
        '\iow_char:N\\c\iow_char:N\{'.
      }
      {
        \@@_if_in_class:TF
          { within~a~character~class~ }
          { following~a~category~test~such~as~'\iow_char:N\\cL'~ }
        because~it~does~not~match~exactly~one~character.
      }
  }
%    \end{macrocode}
%
% Range errors.
%    \begin{macrocode}
\msg_new:nnnn { regex } { range-missing-end }
  { Invalid~end-point~for~range~'#1-#2'~in~character~class. }
  {
    The~end-point~'#2'~of~the~range~'#1-#2'~may~not~serve~as~an~
    end-point~for~a~range:~alphanumeric~characters~should~not~be~
    escaped,~and~non-alphanumeric~characters~should~be~escaped.
  }
\msg_new:nnnn { regex } { range-backwards }
  { Range~'[#1-#2]'~out~of~order~in~character~class. }
  {
    In~ranges~of~characters~'[x-y]'~appearing~in~character~classes,~
    the~first~character~code~must~not~be~larger~than~the~second.~
    Here,~'#1'~has~character~code~\int_eval:n {`#1},~while~
    '#2'~has~character~code~\int_eval:n {`#2}.
  }
%    \end{macrocode}
%
% Errors related to |\c| and |\u|.
%    \begin{macrocode}
\msg_new:nnnn { regex } { c-bad-mode }
  { Invalid~nested~'\iow_char:N\\c'~escape~in~regular~expression. }
  {
    The~'\iow_char:N\\c'~escape~cannot~be~used~within~
    a~control~sequence~test~'\iow_char:N\\c{...}'~
    nor~another~category~test.~
    To~combine~several~category~tests,~use~'\iow_char:N\\c[...]'.
  }
\msg_new:nnnn { regex } { c-C-invalid }
  { '\iow_char:N\\cC'~should~be~followed~by~'.'~or~'(',~not~'#1'. }
  {
    The~'\iow_char:N\\cC'~construction~restricts~the~next~item~to~be~a~
    control~sequence~or~the~next~group~to~be~made~of~control~sequences.~
    It~only~makes~sense~to~follow~it~by~'.'~or~by~a~group.
  }
\msg_new:nnnn { regex } { cu-lbrace }
  { Left~braces~must~be~escaped~in~'\iow_char:N\\#1{...}'. }
  {
    Constructions~such~as~'\iow_char:N\\#1{...\iow_char:N\{...}'~are~
    not~allowed~and~should~be~replaced~by~
    '\iow_char:N\\#1{...\token_to_str:N\{...}'.
  }
\msg_new:nnnn { regex } { c-lparen-in-class }
  { Catcode~test~cannot~apply~to~group~in~character~class }
  {
    Constructions~such~as~'\iow_char:N\\cL(abc)'~are~not~allowed~inside~a~
    class~'[...]'~because~classes~do~not~match~multiple~characters~at~once.
  }
\msg_new:nnnn { regex } { c-missing-rbrace }
  { Missing~right~brace~inserted~for~'\iow_char:N\\c'~escape. }
  {
    LaTeX~was~given~a~regular~expression~where~a~
    '\iow_char:N\\c\iow_char:N\{...'~construction~was~not~ended~
    with~a~closing~brace~'\iow_char:N\}'.
  }
\msg_new:nnnn { regex } { c-missing-rbrack }
  { Missing~right~bracket~inserted~for~'\iow_char:N\\c'~escape. }
  {
    A~construction~'\iow_char:N\\c[...'~appears~in~a~
    regular~expression,~but~the~closing~']'~is~not~present.
  }
\msg_new:nnnn { regex } { c-missing-category }
  { Invalid~character~'#1'~following~'\iow_char:N\\c'~escape. }
  {
    In~regular~expressions,~the~'\iow_char:N\\c'~escape~sequence~
    may~only~be~followed~by~a~left~brace,~a~left~bracket,~or~a~
    capital~letter~representing~a~character~category,~namely~
    one~of~'ABCDELMOPSTU'.
  }
\msg_new:nnnn { regex } { c-trailing }
  { Trailing~category~code~escape~'\iow_char:N\\c'... }
  {
    A~regular~expression~ends~with~'\iow_char:N\\c'~followed~
    by~a~letter.~It~will~be~ignored.
  }
\msg_new:nnnn { regex } { u-missing-lbrace }
  { Missing~left~brace~following~'\iow_char:N\\u'~escape. }
  {
    The~'\iow_char:N\\u'~escape~sequence~must~be~followed~by~
    a~brace~group~with~the~name~of~the~variable~to~use.
  }
%    \end{macrocode}
%
% In the next message an explicit |~| is needed before the quoted
% escape: ordinary spaces are ignored in the code, so without it the
% word \enquote{character} would run into the opening quote.
%    \begin{macrocode}
\msg_new:nnnn { regex } { u-missing-rbrace }
  { Missing~right~brace~inserted~for~'\iow_char:N\\u'~escape. }
  {
    LaTeX~
    \str_if_eq:eeTF { } {#2}
      { reached~the~end~of~the~string~ }
      { encountered~an~escaped~alphanumeric~character~'\iow_char:N\\#2'~ }
    when~parsing~the~argument~of~an~
    '\iow_char:N\\u\iow_char:N\{...\}'~escape.
  }
%    \end{macrocode}
%
% Errors when encountering the \textsc{posix} syntax |[:...:]|.
%    \begin{macrocode}
\msg_new:nnnn { regex } { posix-unsupported }
  { POSIX~collating~element~'[#1 ~ #1]'~not~supported. }
  {
    The~'[.foo.]'~and~'[=bar=]'~syntaxes~have~a~special~meaning~
    in~POSIX~regular~expressions.~This~is~not~supported~by~LaTeX.~
    Maybe~you~forgot~to~escape~a~left~bracket~in~a~character~class?
  }
\msg_new:nnnn { regex } { posix-unknown }
  { POSIX~class~'[:#1:]'~unknown. }
  {
    '[:#1:]'~is~not~among~the~known~POSIX~classes~
    '[:alnum:]',~'[:alpha:]',~'[:ascii:]',~'[:blank:]',~
    '[:cntrl:]',~'[:digit:]',~'[:graph:]',~'[:lower:]',~
    '[:print:]',~'[:punct:]',~'[:space:]',~'[:upper:]',~
    '[:word:]',~and~'[:xdigit:]'.
  }
\msg_new:nnnn { regex } { posix-missing-close }
  { Missing~closing~':]'~for~POSIX~class. }
  { The~POSIX~syntax~'#1'~must~be~followed~by~':]',~not~'#2'. }
%    \end{macrocode}
%
% In various cases, the result of a \pkg{l3regex} operation can leave us
% with an unbalanced token list, which we must re-balance by adding
% begin-group or end-group character tokens.
%    \begin{macrocode}
\msg_new:nnnn { regex } { result-unbalanced }
  { Missing~brace~inserted~when~#1. }
  {
    LaTeX~was~asked~to~do~some~regular~expression~operation,~
    and~the~resulting~token~list~would~not~have~the~same~number~
    of~begin-group~and~end-group~tokens.~Braces~were~inserted:~
    #2~left,~#3~right.
  }
%    \end{macrocode}
%
% Error message for unknown options.
%    \begin{macrocode}
\msg_new:nnnn { regex } { unknown-option }
  { Unknown~option~'#1'~for~regular~expressions. }
  {
    The~only~available~option~is~'case-insensitive',~toggled~by~
    '(?i)'~and~'(?-i)'.
  }
\msg_new:nnnn { regex } { special-group-unknown }
  { Unknown~special~group~'#1~...'~in~a~regular~expression. }
  {
    The~only~valid~constructions~starting~with~'(?'~are~
    '(?:~...~)',~'(?|~...~)',~'(?i)',~and~'(?-i)'.
  }
%    \end{macrocode}
%
% Errors in the replacement text.  Within message texts |\\| stands
% for a line break (it is used in that way in the \texttt{case-odd}
% message below), so a literal backslash always has to be written as
% \cs{iow_char:N} |\\|, including in front of the digit in the
% \texttt{replacement-g} help text.
%    \begin{macrocode}
\msg_new:nnnn { regex } { replacement-c }
  { Misused~'\iow_char:N\\c'~command~in~a~replacement~text. }
  {
    In~a~replacement~text,~the~'\iow_char:N\\c'~escape~sequence~
    can~be~followed~by~one~of~the~letters~'ABCDELMOPSTU'~
    or~a~brace~group,~not~by~'#1'.
  }
\msg_new:nnnn { regex } { replacement-u }
  { Misused~'\iow_char:N\\u'~command~in~a~replacement~text. }
  {
    In~a~replacement~text,~the~'\iow_char:N\\u'~escape~sequence~
    must~be~followed~by~a~brace~group~holding~the~name~of~the~
    variable~to~use.
  }
\msg_new:nnnn { regex } { replacement-g }
  {
    Missing~brace~for~the~'\iow_char:N\\g'~construction~
    in~a~replacement~text.
  }
  {
    In~the~replacement~text~for~a~regular~expression~search,~
    submatches~are~represented~either~as~'\iow_char:N \\g{dd..d}',~
    or~'\iow_char:N \\d',~where~'d'~are~single~digits.~
    Here,~a~brace~is~missing.
  }
\msg_new:nnnn { regex } { replacement-catcode-end }
  {
    Missing~character~for~the~'\iow_char:N\\c'~
    construction~in~a~replacement~text.
  }
  {
    In~a~replacement~text,~the~'\iow_char:N\\c'~escape~sequence~
    can~be~followed~by~one~of~the~letters~'ABCDELMOPSTU'~representing~
    the~character~category.~Then,~a~character~must~follow.~LaTeX~
    reached~the~end~of~the~replacement~when~looking~for~that.
  }
\msg_new:nnnn { regex } { replacement-catcode-escaped }
  {
    Escaped~letter~or~digit~after~category~code~in~replacement~text.
  }
  {
    In~a~replacement~text,~the~'\iow_char:N\\c'~escape~sequence~
    can~be~followed~by~one~of~the~letters~'ABCDELMOPSTU'~representing~
    the~character~category.~Then,~a~character~must~follow,~not~
    '\iow_char:N\\#2'.
  }
\msg_new:nnnn { regex } { replacement-catcode-in-cs }
  {
    Category~code~'\iow_char:N\\c#1#3'~ignored~inside~
    '\iow_char:N\\c\{...\}'~in~a~replacement~text.
  }
  {
    In~a~replacement~text,~the~category~codes~of~the~argument~of~
    '\iow_char:N\\c\{...\}'~are~ignored~when~building~the~control~
    sequence~name.
  }
\msg_new:nnnn { regex } { replacement-null-space }
  { TeX~cannot~build~a~space~token~with~character~code~0. }
  {
    You~asked~for~a~character~token~with~category~space,~
    and~character~code~0,~for~instance~through~
    '\iow_char:N\\cS\iow_char:N\\x00'.~
    This~specific~case~is~impossible~and~will~be~replaced~
    by~a~normal~space.
  }
\msg_new:nnnn { regex } { replacement-missing-rbrace }
  { Missing~right~brace~inserted~in~replacement~text. }
  {
    There~ \int_compare:nTF { #1 = 1 } { was } { were } ~ #1~
    missing~right~\int_compare:nTF { #1 = 1 } { brace } { braces } .
  }
\msg_new:nnnn { regex } { replacement-missing-rparen }
  { Missing~right~parenthesis~inserted~in~replacement~text. }
  {
    There~ \int_compare:nTF { #1 = 1 } { was } { were } ~ #1~
    missing~right~
    \int_compare:nTF { #1 = 1 } { parenthesis } { parentheses } .
  }
\msg_new:nnn { regex } { submatch-too-big }
  { Submatch~#1~used~but~regex~only~has~#2~group(s) }
%    \end{macrocode}
%
% Braced quantifier whose two bounds are given in the wrong order.
%    \begin{macrocode}
\msg_new:nnnn { regex } { backwards-quantifier }
  { Quantifier~"{#1,#2}"~is~backwards. }
  { The~values~given~in~a~quantifier~must~be~in~order. }
%    \end{macrocode}
%
% Used in user commands, and when showing a regex.
%    \begin{macrocode}
\msg_new:nnnn { regex } { case-odd }
  { #1~with~odd~number~of~items }
  {
    There~must~be~a~#2~part~for~each~regex:~
    found~odd~number~of~items~(#3)~in\\
    \iow_indent:n {#4}
  }
\msg_new:nnn { regex } { show }
  {
    >~Compiled~regex~
    \tl_if_empty:nTF {#1} { variable~ #2 } { {#1} } :
    #3
  }
\prop_gput:Nnn \g_msg_module_name_prop { regex } { LaTeX }
\prop_gput:Nnn \g_msg_module_type_prop { regex } { }
%    \end{macrocode}
%
% \begin{macro}{\@@_msg_repeated:nnN}
%   This is not technically a message, but seems related enough to go
%   there.  The arguments are: |#1| is the minimum number of repetitions;
%   |#2| is the number of allowed extra repetitions ($-1$ for infinite
%   number), and |#3| tells us about laziness.  Nothing is produced
%   when |#1| is~$1$ and |#2| is~$0$, namely for an item that is
%   matched exactly once.
%    \begin{macrocode}
\cs_new:Npn \@@_msg_repeated:nnN #1#2#3
  {
    \str_if_eq:eeF { #1 #2 } { 1 0 }
      {
        , ~ repeated ~
        \int_case:nnF {#2}
          {
            { -1 } { #1~or~more~times,~\bool_if:NTF #3 { lazy } { greedy } }
            { 0 } { #1~times }
          }
          {
            between~#1~and~\int_eval:n {#1+#2}~times,~
            \bool_if:NTF #3 { lazy } { greedy }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Code for tracing}
%
% There is a more extensive implementation of tracing in the l3trial
% package \pkg{l3trace}.  Function names are a bit different but could
% be merged.
%
% \begin{macro}
%   {\@@_trace_push:nnN, \@@_trace_pop:nnN, \@@_trace:nne}
%   Here |#1| is the module name (\texttt{regex}) and |#2| is
%   typically~1.
If the module's current tracing level is less than
%   |#2| show nothing, otherwise write |#3| to the terminal.  The
%   \texttt{push} and \texttt{pop} variants receive a function as
%   |#3| and write \enquote{entering} or \enquote{leaving} followed by
%   its name.  The tracing level is read from the integer
%   \texttt{g_@@_trace_\meta{module}_int}, whose name is built
%   from~|#1|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_trace_push:nnN #1#2#3
  { \@@_trace:nne {#1} {#2} { entering~ \token_to_str:N #3 } }
\cs_new_protected:Npn \@@_trace_pop:nnN #1#2#3
  { \@@_trace:nne {#1} {#2} { leaving~ \token_to_str:N #3 } }
\cs_new_protected:Npn \@@_trace:nne #1#2#3
  {
    \int_compare:nNnF
      { \int_use:c { g_@@_trace_#1_int } } < {#2}
      { \iow_term:e { Trace:~#3 } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\g_@@_trace_regex_int}
%   No tracing when that is zero.
%    \begin{macrocode}
\int_new:N \g_@@_trace_regex_int
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_trace_states:n}
%   This function lists the contents of all states of the \textsc{nfa},
%   stored in \tn{toks} from \cs{l_@@_min_state_int} to
%   \cs{l_@@_max_state_int} (excluded).  The argument |#1| is the
%   tracing level passed on to \cs{@@_trace:nne}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_trace_states:n #1
  {
    \int_step_inline:nnn
      \l_@@_min_state_int
      { \l_@@_max_state_int - \c_one_int }
      {
        \@@_trace:nne { regex } {#1}
          { \iow_char:N \\toks ##1 = { \@@_toks_use:w ##1 } }
      }
  }
%    \end{macrocode}
% \end{macro}
%
%    \begin{macrocode}
%
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex
% \endinput
%^^A NOT IMPLEMENTED
%^^A   \p{xx}      a character with the xx property
%^^A   \P{xx}      a character without the xx property
%^^A   (?=...)     positive look ahead
%^^A   (?!...)     negative look ahead
%^^A   (?<=...)    positive look behind
%^^A   (?<!...)    negative look behind
%^^A   (?<name>...) or (?'name'...) or (?P<name>...)
%^^A               named capturing group
%^^A   \R          a newline sequence
%^^A   \X          an extended Unicode sequence
%^^A   (?C) or (?Cn)   callout with data n
%^^A   (?R)            recurse whole pattern
%^^A   (?[+-]n) or \g<[+-]n> or (?&name) or (?P>name) or \g<name>
%^^A                   call subpattern
%^^A   (?([+-]n)... or (?(<name>)...
%^^A                   reference condition
%^^A   (?(R)... or (?(Rn)... or (?(R&name)...
%^^A                   recursion condition
%^^A   (?(DEFINE)...   define subpattern for reference
%^^A   (?(assert)...   assertion condition
%^^A   (?(?=..)..|..)
positive/negative look ahead/behind condition
%^^A   (*ACCEPT)       force successful match
%^^A   (*FAIL)         force backtrack; synonym (*F)
%^^A   (*COMMIT)       overall failure, no advance of starting point
%^^A   (*PRUNE)        advance to next starting character
%^^A   (*SKIP)         advance start to current matching position
%^^A   (*THEN)         local failure, backtrack to next alternation
%^^A   (*CR) or (*LF) or (*CRLF) or (*ANYCRLF) or (*ANY)
%^^A               newline convention
%^^A   (*BSR_ANYCRLF) or (*BSR_UNICODE)
%^^A               change what \R matches.
%^^A
%^^A   \cx             "control-x", where x is any ASCII character
%^^A   \C              one byte, even in UTF-8 mode (best avoided)
%^^A   +               possessive quantifiers
%^^A   (?>...)         atomic, non-capturing group
%^^A   (?#....)        comment (not nestable)
%^^A   (?Jms-UxX)      options (duplicate names; multiline; single line;
%^^A                     unset what follows; ungreedy; extended;
%^^A                     error on bad escapes)
%^^A   (?i:...|...)    convenient shorthand for (?:(?i)...|...)
%^^A   (*NO_START_OPT) no start-match optimization (PCRE_NO_START_OPTIMIZE)
%^^A   (*UTF8)         set UTF-8 mode (PCRE_UTF8)
%^^A   (*UCP)          set PCRE_UCP (use Unicode properties for \d etc)
%^^A   \n or \gn or \g{[-]n} or \g{name} or (?P=name)
%^^A   or \k<name> or \k'name' or \k{name}
%^^A                   back-references