% \iffalse meta-comment
%
%% File: l3text-purify.dtx
%
% Copyright (C) 2020-2024 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version. The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3text-purify} module\\ Text processing (purification)^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2024-03-14}
%
% \maketitle
%
% \begin{documentation}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3text-purify} implementation}
%
%    \begin{macrocode}
%<*package>
%    \end{macrocode}
%
%    \begin{macrocode}
%<@@=text>
%    \end{macrocode}
%
% \subsection{Purifying text}
%
% \begin{macro}[EXP]{\@@_if_recursion_tail_stop:N}
%   Functions to query recursion quarks.
%    \begin{macrocode}
\__kernel_quark_new_test:N \@@_if_recursion_tail_stop:N
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\text_purify:n, \@@_purify:n}
% \begin{macro}[EXP]{\@@_purify_store:n}
% \begin{macro}[EXP]{\@@_purify_store:nw}
% \begin{macro}[EXP]{\@@_purify_end:w}
% \begin{macro}[EXP]{\@@_purify_loop:w}
% \begin{macro}[EXP]{\@@_purify_group:n}
% \begin{macro}[EXP]{\@@_purify_space:w}
% \begin{macro}[EXP]{\@@_purify_N_type:N, \@@_purify_N_type_aux:N}
% \begin{macro}[EXP]{\@@_purify_math_search:NNN}
% \begin{macro}[EXP]{\@@_purify_math_start:NNw}
% \begin{macro}[EXP]{\@@_purify_math_store:n}
% \begin{macro}[EXP]{\@@_purify_math_store:nw}
% \begin{macro}[EXP]{\@@_purify_math_end:w, \@@_purify_math_stop:Nw}
% \begin{macro}[EXP]{\@@_purify_math_loop:NNw}
% \begin{macro}[EXP]{\@@_purify_math_N_type:NNN}
% \begin{macro}[EXP]{\@@_purify_math_group:NNn}
% \begin{macro}[EXP]{\@@_purify_math_space:NNw}
% \begin{macro}[EXP]{\@@_purify_math_cmd:N}
% \begin{macro}[EXP]{\@@_purify_math_cmd:NN}
% \begin{macro}[EXP]{\@@_purify_math_cmd:n}
% \begin{macro}[EXP]{\@@_purify_replace:N}
% \begin{macro}[EXP]{\@@_purify_replace_auxi:n, \@@_purify_replace_auxii:n}
% \begin{macro}[EXP]{\@@_purify_expand:N, \@@_purify_protect:N, \@@_purify_encoding:N}
% \begin{macro}[EXP]{\@@_purify_encoding_escape:NN}
%   As in the other parts of the module, we start off with a standard
%   \enquote{action} loop, with expansion applied up-front.
%   The input is first passed through \cs{text_expand:n}, and the loop
%   then runs inside \cs{exp:w} so that the final result is left as the
%   argument of \cs{__kernel_exp_not:w}. Output is accumulated in the
%   argument of the \cs{@@_purify_result:n} marker, which is placed after
%   the input and the recursion quarks.
%    \begin{macrocode}
\cs_new:Npn \text_purify:n #1
  {
    \__kernel_exp_not:w \exp_after:wN
      {
        \exp:w
        \exp_args:Ne \@@_purify:n
          { \text_expand:n {#1} }
      }
  }
\cs_new:Npn \@@_purify:n #1
  {
    \group_align_safe_begin:
    \@@_purify_loop:w #1
      \q_@@_recursion_tail \q_@@_recursion_stop
    \@@_purify_result:n { }
  }
%    \end{macrocode}
%   As for expansion, collect up the tokens for future use.
%   \cs{@@_purify_store:n} appends its argument to the material held by
%   the \cs{@@_purify_result:n} marker. At the end of the input,
%   \cs{@@_purify_end:w} discards whatever precedes the marker, closes the
%   align-safe group and leaves the collected tokens after \cs{exp_end:}.
%    \begin{macrocode}
\cs_new:Npn \@@_purify_store:n #1
  { \@@_purify_store:nw {#1} }
\cs_new:Npn \@@_purify_store:nw #1#2 \@@_purify_result:n #3
  { #2 \@@_purify_result:n { #3 #1 } }
\cs_new:Npn \@@_purify_end:w #1 \@@_purify_result:n #2
  {
    \group_align_safe_end:
    \exp_end:
    #2
  }
%    \end{macrocode}
%   The main loop is a standard \enquote{tl action}. Unlike the expansion
%   or case changing, here any groups have to be run inline. Most of the
%   business end is as before in the \texttt{N}-type token processing.
%   A leading space is picked up by \cs{@@_purify_space:w}, which is
%   defined with an explicit space token as its delimiter (obtained by
%   expanding \cs{c_space_tl} before the definition is made) and stores a
%   single space.
%    \begin{macrocode}
\cs_new:Npn \@@_purify_loop:w #1 \q_@@_recursion_stop
  {
    \tl_if_head_is_N_type:nTF {#1}
      { \@@_purify_N_type:N }
      {
        \tl_if_head_is_group:nTF {#1}
          { \@@_purify_group:n }
          { \@@_purify_space:w }
      }
    #1 \q_@@_recursion_stop
  }
\cs_new:Npn \@@_purify_group:n #1
  { \@@_purify_loop:w #1 }
\exp_last_unbraced:NNo \cs_new:Npn \@@_purify_space:w \c_space_tl
  {
    \@@_purify_store:n { ~ }
    \@@_purify_loop:w
  }
%    \end{macrocode}
%   The first part of handling math mode is exactly the same as in the
%   other functions: look for a start-of-math mode token and if found start
%   a new loop tracking the closing token.
%   The token is compared by meaning with the entries of
%   \cs{l_text_math_delims_tl}, which are read in pairs: a match on the
%   first of a pair makes the second the closing token to look for.
%   Math content is collected separately, using
%   \cs{@@_purify_math_result:n} as the marker, and the tokens and groups
%   inside it are stored unchanged. When the closing token is found, the
%   collected material is stored between two |$| tokens and the main loop
%   resumes. If the input ends first, the opening token and the material
%   collected so far are stored as they are and purification finishes.
%    \begin{macrocode}
\cs_new:Npn \@@_purify_N_type:N #1
  {
    \@@_if_q_recursion_tail_stop_do:Nn #1 { \@@_purify_end:w }
    \@@_purify_N_type_aux:N #1
  }
\cs_new:Npn \@@_purify_N_type_aux:N #1
  {
    \exp_after:wN \@@_purify_math_search:NNN
      \exp_after:wN #1 \l_text_math_delims_tl
      \q_@@_recursion_tail ?
      \q_@@_recursion_stop
  }
\cs_new:Npn \@@_purify_math_search:NNN #1#2#3
  {
    \@@_if_q_recursion_tail_stop_do:Nn #2
      { \@@_purify_math_cmd:N #1 }
    \token_if_eq_meaning:NNTF #1 #2
      {
        \@@_use_i_delimit_by_q_recursion_stop:nw
          { \@@_purify_math_start:NNw #2 #3 }
      }
      { \@@_purify_math_search:NNN #1 }
  }
\cs_new:Npn \@@_purify_math_start:NNw #1#2#3 \q_@@_recursion_stop
  {
    \@@_purify_math_loop:NNw #1#2#3 \q_@@_recursion_stop
    \@@_purify_math_result:n { }
  }
\cs_new:Npn \@@_purify_math_store:n #1
  { \@@_purify_math_store:nw {#1} }
\cs_new:Npn \@@_purify_math_store:nw #1#2 \@@_purify_math_result:n #3
  { #2 \@@_purify_math_result:n { #3 #1 } }
\cs_new:Npn \@@_purify_math_end:w #1 \@@_purify_math_result:n #2
  {
    \@@_purify_store:n { $ #2 $ }
    \@@_purify_loop:w #1
  }
\cs_new:Npn \@@_purify_math_stop:Nw #1 \@@_purify_math_result:n #2
  {
    \@@_purify_store:n {#1#2}
    \@@_purify_end:w
  }
\cs_new:Npn \@@_purify_math_loop:NNw #1#2#3 \q_@@_recursion_stop
  {
    \tl_if_head_is_N_type:nTF {#3}
      { \@@_purify_math_N_type:NNN }
      {
        \tl_if_head_is_group:nTF {#3}
          { \@@_purify_math_group:NNn }
          { \@@_purify_math_space:NNw }
      }
    #1#2#3 \q_@@_recursion_stop
  }
\cs_new:Npn \@@_purify_math_N_type:NNN #1#2#3
  {
    \@@_if_q_recursion_tail_stop_do:Nn #3
      { \@@_purify_math_stop:Nw #1 }
    \token_if_eq_meaning:NNTF #3 #2
      { \@@_purify_math_end:w }
      {
        \@@_purify_math_store:n {#3}
        \@@_purify_math_loop:NNw #1#2
      }
  }
\cs_new:Npn \@@_purify_math_group:NNn #1#2#3
  {
    \@@_purify_math_store:n { {#3} }
    \@@_purify_math_loop:NNw #1#2
  }
\exp_after:wN \cs_new:Npn \exp_after:wN \@@_purify_math_space:NNw
  \exp_after:wN # \exp_after:wN 1
  \exp_after:wN # \exp_after:wN 2 \c_space_tl
  {
    \@@_purify_math_store:n { ~ }
    \@@_purify_math_loop:NNw #1#2
  }
%    \end{macrocode}
%   Then handle math mode as an argument: same outcomes, different input
%   syntax.
%   The token is compared with each entry of \cs{l_text_math_arg_tl} in
%   turn. On a match the argument which follows is handed to
%   \cs{@@_purify_math_end:w} as already-complete math content; if no
%   entry matches, the token is passed on to \cs{@@_purify_replace:N}.
%    \begin{macrocode}
\cs_new:Npn \@@_purify_math_cmd:N #1
  {
    \exp_after:wN \@@_purify_math_cmd:NN \exp_after:wN #1
      \l_text_math_arg_tl \q_@@_recursion_tail \q_@@_recursion_stop
  }
\cs_new:Npn \@@_purify_math_cmd:NN #1#2
  {
    \@@_if_q_recursion_tail_stop_do:Nn #2
      { \@@_purify_replace:N #1 }
    \cs_if_eq:NNTF #2 #1
      {
        \@@_use_i_delimit_by_q_recursion_stop:nw
          { \@@_purify_math_cmd:n }
      }
      { \@@_purify_math_cmd:NN #1 }
  }
\cs_new:Npn \@@_purify_math_cmd:n #1
  { \@@_purify_math_end:w \@@_purify_math_result:n {#1} }
%    \end{macrocode}
%   For \texttt{N}-type tokens, we first look for a string-context replacement
%   before anything else: this can therefore cover anything. Assuming we don't
%   find one, check to see if we can expand control sequences: if not, they have
%   to be dropped. We also allow for \LaTeXe{} \tn{protect}: there's an
%   assumption that we don't have |\protect { \oops }| or similar, but that's
%   also in the expansion code and seems like a reasonable balance.
%   The replacement is looked up in a token list variable whose name is
%   built from the string form of the token, and is used only when the
%   token is a control sequence or an active character. The replacement
%   text is put back at the front of the input, so it is itself
%   purified. Tokens which are not control sequences are stored directly.
%   The token following \tn{protect} is discarded along with it.
%    \begin{macrocode}
\cs_new:Npn \@@_purify_replace:N #1
  {
    \bool_lazy_and:nnTF
      { \cs_if_exist_p:c { l_@@_purify_ \token_to_str:N #1 _tl } }
      {
        \bool_lazy_or_p:nn
          { \token_if_cs_p:N #1 }
          { \token_if_active_p:N #1 }
      }
      {
        \exp_args:Nv \@@_purify_replace_auxi:n
          { l_@@_purify_ \token_to_str:N #1 _tl }
      }
      {
        \exp_args:Ne \@@_purify_replace_auxii:n
          { \@@_token_to_explicit:N #1 }
      }
  }
\cs_new:Npn \@@_purify_replace_auxi:n #1 { \@@_purify_loop:w #1 }
\cs_new:Npn \@@_purify_replace_auxii:n #1
  {
    \token_if_cs:NTF #1
      { \@@_purify_expand:N #1 }
      {
        \@@_purify_store:n {#1}
        \@@_purify_loop:w
      }
  }
\cs_new:Npn \@@_purify_expand:N #1
  {
    \str_if_eq:nnTF {#1} { \protect }
      { \@@_purify_protect:N }
      { \@@_purify_encoding:N #1 }
  }
\cs_new:Npn \@@_purify_protect:N #1
  {
    \@@_if_q_recursion_tail_stop_do:Nn #1 { \@@_purify_end:w }
    \@@_purify_loop:w
  }
%    \end{macrocode}
%   Handle encoding commands, as detailed for expansion.
% \begin{macrocode} \cs_new:Npn \@@_purify_encoding:N #1 { \bool_lazy_or:nnTF { \cs_if_eq_p:NN #1 \@current@cmd } { \cs_if_eq_p:NN #1 \@changed@cmd } { \@@_purify_encoding_escape:NN } { \@@_if_expandable:NTF #1 { \exp_after:wN \@@_purify_loop:w #1 } { \@@_purify_loop:w } } } \cs_new:Npn \@@_purify_encoding_escape:NN #1#2 { \@@_purify_store:n {#1} \@@_purify_loop:w } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % % \begin{macro} % { % \text_declare_purify_equivalent:Nn, % \text_declare_purify_equivalent:Ne % } % \begin{macrocode} \cs_new_protected:Npn \text_declare_purify_equivalent:Nn #1#2 { \tl_clear_new:c { l_@@_purify_ \token_to_str:N #1 _tl } \tl_set:cn { l_@@_purify_ \token_to_str:N #1 _tl } {#2} } \cs_generate_variant:Nn \text_declare_purify_equivalent:Nn { Ne } % \end{macrocode} % \end{macro} % % Now pre-define a range of standard commands that need dedicated definitions % in purified text. First handle font-related stuff: all of this needs to be % disabled. 
% Font selection commands are replaced by functions which discard their
% arguments, text font commands keep only their argument, and font
% declarations are removed entirely.
%    \begin{macrocode}
\tl_map_inline:nn
  { \fontencoding \fontfamily \fontseries \fontshape }
  { \text_declare_purify_equivalent:Nn #1 { \use_none:n } }
\text_declare_purify_equivalent:Nn \fontsize { \use_none:nn }
\text_declare_purify_equivalent:Nn \selectfont { }
\text_declare_purify_equivalent:Nn \usefont { \use_none:nnnn }
\tl_map_inline:nn
  {
    \emph
    \text
    \textnormal
    \textrm
    \textsf
    \texttt
    \textbf
    \textmd
    \textit
    \textsl
    \textup
    \textsc
    \textulc
  }
  { \text_declare_purify_equivalent:Nn #1 { \use:n } }
\tl_map_inline:nn
  {
    \normalfont
    \rmfamily
    \sffamily
    \ttfamily
    \bfseries
    \mdseries
    \itshape
    \scshape
    \slshape
    \upshape
    \em
    \Huge
    \LARGE
    \Large
    \footnotesize
    \huge
    \large
    \normalsize
    \scriptsize
    \small
    \tiny
  }
  { \text_declare_purify_equivalent:Nn #1 { } }
\exp_args:Nc \text_declare_purify_equivalent:Nn
  { @protected@testopt } { \use_none:nnn }
%    \end{macrocode}
% Environments have to be handled by pure expansion.
% \tn{begin} is replaced by \cs{use:c}, so the environment name is turned
% into the control sequence of that name, and \tn{end} by a function which
% builds the control sequence named |end| followed by the environment name.
%
% \begin{macro}{\@@_end_env:n}
%    \begin{macrocode}
\text_declare_purify_equivalent:Nn \begin { \use:c }
\text_declare_purify_equivalent:Nn \end { \@@_end_env:n }
\cs_new:Npn \@@_end_env:n #1 { \cs:w end #1 \cs_end: }
%    \end{macrocode}
% \end{macro}
%
% Some common symbols and similar ideas.
% The line break command is removed, and each escaped symbol is replaced
% by the result of applying \cs{cs_to_str:N} to it.
%    \begin{macrocode}
\text_declare_purify_equivalent:Nn \\ { }
\tl_map_inline:nn
  { \{ \} \# \$ \% \_ }
  { \text_declare_purify_equivalent:Ne #1 { \cs_to_str:N #1 } }
%    \end{macrocode}
% Cross-referencing.
%    \begin{macrocode}
\text_declare_purify_equivalent:Nn \label { \use_none:n }
%    \end{macrocode}
% Spaces.
% An active tilde, \tn{nobreakspace}, a control space and \tn{,} are all
% replaced by a normal space. The argument of \cs{use:n} is read while
% the tilde is active, so the declaration made after \cs{group_end:}
% applies to the active character.
%    \begin{macrocode}
\group_begin:
  \char_set_catcode_active:N \~
  \use:n
    {
      \group_end:
      \text_declare_purify_equivalent:Ne ~ { \c_space_tl }
    }
\text_declare_purify_equivalent:Nn \nobreakspace { ~ }
\text_declare_purify_equivalent:Nn \ { ~ }
\text_declare_purify_equivalent:Nn \, { ~ }
%    \end{macrocode}
%
% \subsection{Accent and letter-like data for purifying text}
%
% In contrast to case changing, both $8$-bit and Unicode engines need
% information for text purification to handle accents and letter-like
% functions: these all need to be removed. However, the results are
% of course engine-dependent.
%
% For the letter-like commands, life is relatively easy: they are all
% simply added as standard exceptions. The only oddity is \tn{SS}, which
% gets converted to two letters. (At some stage an alternative version
% can presumably be added to \pkg{babel} or similar.)
% Each entry in the table pairs a command with the hexadecimal codepoint
% of its replacement. Note that \tn{IJ} and \tn{ij} are separate
% codepoints (U+0132 and U+0133 respectively).
%    \begin{macrocode}
\cs_set_protected:Npn \@@_loop:Nn #1#2
  {
    \quark_if_recursion_tail_stop:N #1
    \text_declare_purify_equivalent:Ne #1
      {
        \codepoint_generate:nn {"#2}
          { \char_value_catcode:n {"#2} }
      }
    \@@_loop:Nn
  }
\@@_loop:Nn
  \AA { 00C5 }
  \AE { 00C6 }
  \DH { 00D0 }
  \DJ { 0110 }
  \IJ { 0132 }
  \L  { 0141 }
  \NG { 014A }
  \O  { 00D8 }
  \OE { 0152 }
  \TH { 00DE }
  \aa { 00E5 }
  \ae { 00E6 }
  \dh { 00F0 }
  \dj { 0111 }
  \i  { 0131 }
  \j  { 0237 }
  \ij { 0133 }
  \l  { 0142 }
  \ng { 014B }
  \o  { 00F8 }
  \oe { 0153 }
  \ss { 00DF }
  \th { 00FE }
  \q_recursion_tail ?
  \q_recursion_stop
\text_declare_purify_equivalent:Nn \SS { SS }
%    \end{macrocode}
%
% \begin{macro}[rEXP]{\@@_purify_accent:NN}
%   Accent \textsc{licr} handling is a little more complex. Accents may exist
%   as pre-composed codepoints or as independent glyphs. The former are all
%   saved as single token lists, whilst for the latter the combining accent
%   needs to be re-ordered compared to the character it applies to.
%   The first argument is the accent command and the second the token it
%   applies to. If a pre-composed form has been stored for the pair, it is
%   used; otherwise the base token is left first, followed by the
%   combining accent stored for the accent command.
%    \begin{macrocode}
\cs_new:Npn \@@_purify_accent:NN #1#2
  {
    \cs_if_exist:cTF
      { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
      {
        \exp_not:v
          { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
      }
      {
        \exp_not:n {#2}
        \exp_not:v { c_@@_purify_ \token_to_str:N #1 _tl }
      }
  }
\tl_map_inline:nn { \` \' \^ \~ \= \u \. \" \r \H \v \d \c \k \b \t }
  { \text_declare_purify_equivalent:Nn #1 { \@@_purify_accent:NN #1 } }
%    \end{macrocode}
%   First set up the combining accents.
%   Each accent command is paired with the hexadecimal codepoint of its
%   combining character, which is stored in a constant token list named
%   after the accent command.
%    \begin{macrocode}
\group_begin:
  \cs_set_protected:Npn \@@_loop:Nn #1#2
    {
      \quark_if_recursion_tail_stop:N #1
      \tl_const:ce { c_@@_purify_ \token_to_str:N #1 _tl }
        { \codepoint_generate:nn {"#2} { \char_value_catcode:n { "#2 } } }
      \@@_loop:Nn
    }
  \@@_loop:Nn
    \` { 0300 }
    \' { 0301 }
    \^ { 0302 }
    \~ { 0303 }
    \= { 0304 }
    \u { 0306 }
    \. { 0307 }
    \" { 0308 }
    \r { 030A }
    \H { 030B }
    \v { 030C }
    \d { 0323 }
    \c { 0327 }
    \k { 0328 }
    \b { 0331 }
    \t { 0361 }
    \q_recursion_tail { }
    \q_recursion_stop
%    \end{macrocode}
%   Now we handle the pre-composed accents: the list here is taken from
%   \texttt{puenc.def}. All of the precomposed cases take a single letter
%   as their second argument. We do not try to cover the case where an accent
%   is added to a \enquote{real} dotless-i or -j, or a \ae/\AE. Rather, we
%   assume that if the \textsc{utf}-8 character is used, it will have the
%   real accent character too.
%   Each entry gives the accent command, the token it applies to and the
%   hexadecimal codepoint of the pre-composed character; the result is
%   stored in a constant token list named after the accent and the token.
%   Where a letter and its dotless form share a pre-composed character,
%   both are listed.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_loop:NNn #1#2#3
    {
      \quark_if_recursion_tail_stop:N #1
      \tl_const:ce { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
        { \codepoint_generate:nn {"#3} { \char_value_catcode:n { "#3 } } }
      \@@_loop:NNn
    }
  \@@_loop:NNn
    \` A   { 00C0 } \' A   { 00C1 } \^ A   { 00C2 } \~ A   { 00C3 }
    \" A   { 00C4 } \r A   { 00C5 } \c C   { 00C7 } \` E   { 00C8 }
    \' E   { 00C9 } \^ E   { 00CA } \" E   { 00CB } \` I   { 00CC }
    \' I   { 00CD } \^ I   { 00CE } \" I   { 00CF } \~ N   { 00D1 }
    \` O   { 00D2 } \' O   { 00D3 } \^ O   { 00D4 } \~ O   { 00D5 }
    \" O   { 00D6 } \` U   { 00D9 } \' U   { 00DA } \^ U   { 00DB }
    \" U   { 00DC } \' Y   { 00DD } \` a   { 00E0 } \' a   { 00E1 }
    \^ a   { 00E2 } \~ a   { 00E3 } \" a   { 00E4 } \r a   { 00E5 }
    \c c   { 00E7 } \` e   { 00E8 } \' e   { 00E9 } \^ e   { 00EA }
    \" e   { 00EB } \` i   { 00EC } \` \i  { 00EC } \' i   { 00ED }
    \' \i  { 00ED } \^ i   { 00EE } \^ \i  { 00EE } \" i   { 00EF }
    \" \i  { 00EF } \~ n   { 00F1 } \` o   { 00F2 } \' o   { 00F3 }
    \^ o   { 00F4 } \~ o   { 00F5 } \" o   { 00F6 } \` u   { 00F9 }
    \' u   { 00FA } \^ u   { 00FB } \" u   { 00FC } \' y   { 00FD }
    \" y   { 00FF } \= A   { 0100 } \= a   { 0101 } \u A   { 0102 }
    \u a   { 0103 } \k A   { 0104 } \k a   { 0105 } \' C   { 0106 }
    \' c   { 0107 } \^ C   { 0108 } \^ c   { 0109 } \. C   { 010A }
    \. c   { 010B } \v C   { 010C } \v c   { 010D } \v D   { 010E }
    \v d   { 010F } \= E   { 0112 } \= e   { 0113 } \u E   { 0114 }
    \u e   { 0115 } \. E   { 0116 } \. e   { 0117 } \k E   { 0118 }
    \k e   { 0119 } \v E   { 011A } \v e   { 011B } \^ G   { 011C }
    \^ g   { 011D } \u G   { 011E } \u g   { 011F } \. G   { 0120 }
    \. g   { 0121 } \c G   { 0122 } \c g   { 0123 } \^ H   { 0124 }
    \^ h   { 0125 } \~ I   { 0128 } \~ i   { 0129 } \~ \i  { 0129 }
    \= I   { 012A } \= i   { 012B } \= \i  { 012B } \u I   { 012C }
    \u i   { 012D } \u \i  { 012D } \k I   { 012E } \k i   { 012F }
    \k \i  { 012F } \. I   { 0130 } \^ J   { 0134 } \^ j   { 0135 }
    \^ \j  { 0135 } \c K   { 0136 } \c k   { 0137 } \' L   { 0139 }
    \' l   { 013A } \c L   { 013B } \c l   { 013C } \v L   { 013D }
    \v l   { 013E } \. L   { 013F } \. l   { 0140 } \' N   { 0143 }
    \' n   { 0144 } \c N   { 0145 } \c n   { 0146 } \v N   { 0147 }
    \v n   { 0148 } \= O   { 014C } \= o   { 014D } \u O   { 014E }
    \u o   { 014F } \H O   { 0150 } \H o   { 0151 } \' R   { 0154 }
    \' r   { 0155 } \c R   { 0156 } \c r   { 0157 } \v R   { 0158 }
    \v r   { 0159 } \' S   { 015A } \' s   { 015B } \^ S   { 015C }
    \^ s   { 015D } \c S   { 015E } \c s   { 015F } \v S   { 0160 }
    \v s   { 0161 } \c T   { 0162 } \c t   { 0163 } \v T   { 0164 }
    \v t   { 0165 } \~ U   { 0168 } \~ u   { 0169 } \= U   { 016A }
    \= u   { 016B } \u U   { 016C } \u u   { 016D } \r U   { 016E }
    \r u   { 016F } \H U   { 0170 } \H u   { 0171 } \k U   { 0172 }
    \k u   { 0173 } \^ W   { 0174 } \^ w   { 0175 } \^ Y   { 0176 }
    \^ y   { 0177 } \" Y   { 0178 } \' Z   { 0179 } \' z   { 017A }
    \. Z   { 017B } \. z   { 017C } \v Z   { 017D } \v z   { 017E }
    \v A   { 01CD } \v a   { 01CE } \v I   { 01CF } \v \i  { 01D0 }
    \v i   { 01D0 } \v O   { 01D1 } \v o   { 01D2 } \v U   { 01D3 }
    \v u   { 01D4 } \v G   { 01E6 } \v g   { 01E7 } \v K   { 01E8 }
    \v k   { 01E9 } \k O   { 01EA } \k o   { 01EB } \v \j  { 01F0 }
    \v j   { 01F0 } \' G   { 01F4 } \' g   { 01F5 } \` N   { 01F8 }
    \` n   { 01F9 } \' \AE { 01FC } \' \ae { 01FD } \' \O  { 01FE }
    \' \o  { 01FF } \v H   { 021E } \v h   { 021F } \. A   { 0226 }
    \. a   { 0227 } \c E   { 0228 } \c e   { 0229 } \. O   { 022E }
    \. o   { 022F } \= Y   { 0232 } \= y   { 0233 }
    \q_recursion_tail ? { }
    \q_recursion_stop
\group_end:
%    \end{macrocode}
% \end{macro}
%
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex