% \iffalse meta-comment
%
%% File: l3text-map.dtx
%
% Copyright (C) 2022-2024 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3text-map} module\\ Text processing (mapping)^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2024-03-14}
%
% \maketitle
%
% \begin{documentation}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3text-map} implementation}
%
%    \begin{macrocode}
%<*package>
%    \end{macrocode}
%
%    \begin{macrocode}
%<@@=text>
%    \end{macrocode}
%
% \subsection{Mapping to text}
%
% \begin{macro}[EXP]{\text_map_function:nN}
% \begin{macro}[EXP]{\@@_map_function:nN}
% \begin{macro}[EXP]{\@@_map_loop:Nnw}
% \begin{macro}[EXP]{\@@_map_group:Nnn}
% \begin{macro}[EXP]{\@@_map_space:Nnw}
% \begin{macro}[EXP]{\@@_map_N_type:NnN}
% \begin{macro}[EXP]{\@@_map_codepoint:Nnn}
% \begin{macro}[EXP]{\@@_map_CR:Nnw}
% \begin{macro}[EXP]{\@@_map_CR:NnN}
% \begin{macro}[EXP]{\@@_map_class:Nnnn}
% \begin{macro}[EXP]{\@@_map_class:nNnnn}
% \begin{macro}[EXP]{\@@_map_class_loop:Nnnnw}
% \begin{macro}[EXP]{\@@_map_class_end:nw}
% \begin{macro}[EXP]
%   {
%     \@@_map_Control:Nnn     ,
%     \@@_map_Extend:Nnn      ,
%     \@@_map_SpacingMark:Nnn ,
%     \@@_map_Prepend:Nnn     ,
%     \@@_map_Prepend_aux:Nnn
%   }
% \begin{macro}[EXP]{\@@_map_Prepend:nNnn}
% \begin{macro}[EXP]{\@@_map_Prepend_loop:Nnnw}
% \begin{macro}[EXP]
%   {
%     \@@_map_not_Control:Nnn            ,
%     \@@_map_not_Extend:Nnn             ,
%     \@@_map_not_SpacingMark:Nnn        ,
%     \@@_map_not_Prepend:Nnn            ,
%     \@@_map_not_L:Nnn                  ,
%     \@@_map_not_LV:Nnn                 ,
%     \@@_map_not_V:Nnn                  ,
%     \@@_map_not_LVT:Nnn                ,
%     \@@_map_not_T:Nnn                  ,
%     \@@_map_not_Regional_Indicator:Nnn
%   }
% \begin{macro}[EXP]
%   {
%     \@@_map_L:Nnn   ,
%     \@@_map_LV:Nnn  ,
%     \@@_map_V:Nnn   ,
%     \@@_map_LVT:Nnn ,
%     \@@_map_T:Nnn
%   }
% \begin{macro}[EXP]{\@@_map_hangul:Nnnw}
% \begin{macro}[EXP]{\@@_map_hangul:NnnN}
% \begin{macro}[EXP]{\@@_map_hangul:Nnnn}
% \begin{macro}[EXP]{\@@_map_hangul_aux:Nnnw}
% \begin{macro}[EXP]{\@@_map_hangul:nNnnnw}
% \begin{macro}[EXP]{\@@_map_hangul_loop:Nnnnnw}
% \begin{macro}[EXP]{\@@_map_hangul_next:Nnnn}
% \begin{macro}[EXP]{\@@_map_hangul_end:nw}
% \begin{macro}[EXP]
%   {
%     \@@_map_hangul_L:Nnn   ,
%     \@@_map_hangul_LV:Nnn  ,
%     \@@_map_hangul_V:Nnn   ,
%     \@@_map_hangul_LVT:Nnn ,
%     \@@_map_hangul_T:Nnn
%   }
% \begin{macro}[EXP]
%   {\@@_map_Regional_Indicator:Nnn, \@@_map_Regional_Indicator_aux:Nnn}
% \begin{macro}[EXP]{\@@_map_lookahead:NnNw}
% \begin{macro}[EXP]{\@@_map_lookahead:NnNN}
% \begin{macro}[EXP]{\@@_map_output:Nn}
% \begin{macro}[EXP]{\text_map_break:}
% \begin{macro}[EXP]{\text_map_break:n}
%   The public entry point: the text (|#1|) is first expanded using
%   \cs{text_expand:n} inside an \texttt{e}-type expansion, and the result
%   is handed to the internal starter along with the mapping function
%   (|#2|).
%    \begin{macrocode}
\cs_new:Npn \text_map_function:nN #1#2
  {
    \exp_args:Ne \@@_map_function:nN
      { \text_expand:n {#1} } #2
  }
%    \end{macrocode}
%   The starter sets up the loop with the function first, an empty
%   \enquote{store} of pending material second, and then the text itself
%   terminated by the recursion quarks. The break point is the target
%   for \cs{text_map_break:} and \cs{text_map_break:n}.
%    \begin{macrocode}
\cs_new:Npn \@@_map_function:nN #1#2
  {
    \@@_map_loop:Nnw #2 { } #1
      \q_@@_recursion_tail \q_@@_recursion_stop
    \prg_break_point:Nn \text_map_break: { }
  }
%    \end{macrocode}
%   The standard set up for an \enquote{action} loop. Groups are handled by
%   recursion, spaces are treated similarly: both count as grapheme boundaries.
%   For \texttt{N}-type tokens, we filter out control sequences (again
%   a boundary), then move on to further analysis.
%    \begin{macrocode}
\cs_new:Npn \@@_map_loop:Nnw #1#2#3 \q_@@_recursion_stop
  {
    \tl_if_head_is_N_type:nTF {#3}
      { \@@_map_N_type:NnN }
      {
        \tl_if_head_is_group:nTF {#3}
          { \@@_map_group:Nnn }
          { \@@_map_space:Nnw }
      }
    #1 {#2} #3 \q_@@_recursion_stop
  }
%    \end{macrocode}
%   For a group, |#2| is the material collected so far and |#3| is the
%   content of the group. The pending material is output, then the
%   \emph{group content} is mapped over inside a brace pair using a
%   nested loop with its own break point, before the outer loop carries
%   on with an empty store.
%    \begin{macrocode}
\cs_new:Npn \@@_map_group:Nnn #1#2#3
  {
    \@@_map_output:Nn #1 {#2}
    {
      \@@_map_loop:Nnw #1 { } #3
        \q_@@_recursion_tail \q_@@_recursion_stop
      \prg_break_point:Nn \text_map_break: { }
    }
    \@@_map_loop:Nnw #1 { }
  }
%    \end{macrocode}
%   A space is delimited by \cs{c_space_tl}: the pending material is
%   output, then the function is applied to the space itself.
%    \begin{macrocode}
\use:e
  {
    \cs_new:Npn \exp_not:N \@@_map_space:Nnw #1#2 \c_space_tl
  }
  {
    \@@_map_output:Nn #1 {#2}
    #1 { ~ }
    \@@_map_loop:Nnw #1 { }
  }
%    \end{macrocode}
%   For an \texttt{N}-type token, the end of the input means output what
%   is pending and break. A control sequence is a boundary: what is
%   pending is output, then the function is applied to the control
%   sequence on its own. Anything else is passed to
%   \cs{@@_codepoint_process:nN} so that \cs{@@_map_codepoint:Nnn} can
%   examine it.
%    \begin{macrocode}
\cs_new:Npn \@@_map_N_type:NnN #1#2#3
  {
    \@@_if_q_recursion_tail_stop_do:Nn #3
      {
        \@@_map_output:Nn #1 {#2}
        \text_map_break:
      }
    \token_if_cs:NTF #3
      {
        \@@_map_output:Nn #1 {#2}
        #1 {#3}
        \@@_map_loop:Nnw #1 { }
      }
      { \@@_codepoint_process:nN { \@@_map_codepoint:Nnn #1 {#2} } #3 }
  }
%    \end{macrocode}
%   We pull out a few special cases here. The carriage return case needs
%   a bit of context handling so has an auxiliary. Codepoint U+200D is the
%   zero-width joiner, which has no context to concern us: just don't break.
%    \begin{macrocode}
\cs_new:Npn \@@_map_codepoint:Nnn #1#2#3
  {
    \@@_codepoint_compare:nNnTF {#3} = { "0D }
      {
        \@@_map_output:Nn #1 {#2}
        \@@_map_CR:Nnw #1 {#3}
      }
      {
        \@@_codepoint_compare:nNnTF {#3} = { "200D }
          { \@@_map_loop:Nnw #1 {#2#3} }
          { \@@_map_class:Nnnn #1 {#2} {#3} { Control } }
      }
  }
%    \end{macrocode}
%   A carriage return is a boundary unless it is immediately followed by
%   a line feed, in which case that pair is a boundary. Here |#2| is the
%   carriage return itself: in every branch it has to be passed to the
%   function, either alone or together with the line feed.
%    \begin{macrocode}
\cs_new:Npn \@@_map_CR:Nnw #1#2#3 \q_@@_recursion_stop
  {
    \tl_if_head_is_N_type:nTF {#3}
      { \@@_map_CR:NnN #1 {#2} }
      {
        #1 {#2}
        \@@_map_loop:Nnw #1 { }
      }
    #3 \q_@@_recursion_stop
  }
\cs_new:Npn \@@_map_CR:NnN #1#2#3
  {
    \@@_if_q_recursion_tail_stop_do:Nn #3
      {
        #1 {#2}
        \text_map_break:
      }
    \bool_lazy_and:nnTF
      { ! \token_if_cs_p:N #3 }
      { \int_compare_p:nNn { `#3 } = { "0A } }
      {
        \@@_map_output:Nn #1 {#2#3}
        \@@_map_loop:Nnw #1 { }
      }
      {
        #1 {#2}
        \@@_map_loop:Nnw #1 { } #3
      }
  }
%    \end{macrocode}
%   There are various classes of character, and we deal with them all in
%   the same general way. We need to examine the relevant list of codepoints:
%   if we get a hit, then we do whatever the relevant action is. Otherwise
%   we loop, but only if the current codepoint could still match: the
%   loop stops early otherwise and we move forward. The list for class
%   |#4| is the value of \texttt{c_@@_grapheme_\meta{class}_clist}; each
%   entry is read as \texttt{\meta{start}..\meta{end}} with both parts
%   used as hexadecimal values. A hit calls
%   \texttt{\cs{@@_map_\meta{class}:Nnn}}, a miss calls
%   \texttt{\cs{@@_map_not_\meta{class}:Nnn}}.
%    \begin{macrocode}
\cs_new:Npn \@@_map_class:Nnnn #1#2#3#4
  {
    \exp_args:Nv \@@_map_class:nNnnn { c_@@_grapheme_ #4 _clist }
      #1 {#2} {#3} {#4}
  }
\cs_new:Npn \@@_map_class:nNnnn #1#2#3#4#5
  {
    \@@_map_class_loop:Nnnnw #2 {#3} {#4} {#5}
      #1 , \q_@@_recursion_tail .. , \q_@@_recursion_stop
  }
\cs_new:Npn \@@_map_class_loop:Nnnnw #1#2#3#4 #5 .. #6 ,
  {
    \@@_if_q_recursion_tail_stop_do:nn {#5}
      { \use:c { @@_map_not_ #4 :Nnn } #1 {#2} {#3} }
    \@@_codepoint_compare:nNnTF {#3} < { "#5 }
      {
        \@@_map_class_end:nw
          { \use:c { @@_map_not_ #4 :Nnn } #1 {#2} {#3} }
      }
      {
        \@@_codepoint_compare:nNnTF {#3} > { "#6 }
          { \@@_map_class_loop:Nnnnw #1 {#2} {#3} {#4} }
          {
            \@@_map_class_end:nw
              { \use:c { @@_map_ #4 :Nnn } #1 {#2} {#3} }
          }
      }
  }
\cs_new:Npn \@@_map_class_end:nw #1#2 \q_@@_recursion_stop {#1}
%    \end{macrocode}
%   Break before \emph{and} after.
%    \begin{macrocode}
\cs_new:Npn \@@_map_Control:Nnn #1#2#3
  {
    \@@_map_output:Nn #1 {#2}
    \@@_map_output:Nn #1 {#3}
    \@@_map_loop:Nnw #1 { }
  }
%    \end{macrocode}
%   Keep collecting.
%    \begin{macrocode}
\cs_new:Npn \@@_map_Extend:Nnn #1#2#3
  { \@@_map_loop:Nnw #1 {#2#3} }
\cs_new_eq:NN \@@_map_SpacingMark:Nnn \@@_map_Extend:Nnn
%    \end{macrocode}
%   Output anything collected earlier, then combine with what follows.
%   The only exclusions are control characters.
%    \begin{macrocode}
\cs_new:Npn \@@_map_Prepend:Nnn #1#2#3
  {
    \@@_map_output:Nn #1 {#2}
    \@@_map_lookahead:NnNw #1 {#3} \@@_map_Prepend_aux:Nnn
  }
%    \end{macrocode}
%   The auxiliary receives the function, the prepending codepoint
%   (|#2|) and the codepoint which follows it (|#3|). Line feed and
%   carriage return are tested for directly; anything else is checked
%   against the \texttt{Control} list, which is looked up by name in the
%   same way as in \cs{@@_map_class:Nnnn}.
%    \begin{macrocode}
\cs_new:Npn \@@_map_Prepend_aux:Nnn #1#2#3
  {
    \bool_lazy_or:nnTF
      { \@@_codepoint_compare_p:nNn {#3} = { "0A } }
      { \@@_codepoint_compare_p:nNn {#3} = { "0D } }
      {
        #1 {#2}
        \@@_map_loop:Nnw #1 {#3}
      }
      {
        \exp_args:Nv \@@_map_Prepend:nNnn
          { c_@@_grapheme_Control_clist }
          #1 {#2} {#3}
      }
  }
\cs_new:Npn \@@_map_Prepend:nNnn #1#2#3#4
  {
    \@@_map_Prepend_loop:Nnnw #2 {#3} {#4}
      #1 , \q_@@_recursion_tail .. , \q_@@_recursion_stop
  }
%    \end{macrocode}
%   The loop over the ranges has the same shape as
%   \cs{@@_map_class_loop:Nnnnw}. Running off the end of the list, or
%   finding that the codepoint lies below the current range, means it is
%   not a control character: it is added to the store alongside the
%   prepending codepoint. A hit puts the codepoint back into the input
%   with only the prepending codepoint in the store.
%    \begin{macrocode}
\cs_new:Npn \@@_map_Prepend_loop:Nnnw #1#2#3 #4 .. #5 ,
  {
    \@@_if_q_recursion_tail_stop_do:nn {#4}
      { \@@_map_loop:Nnw #1 {#2#3} }
    \@@_codepoint_compare:nNnTF {#3} < { "#4 }
      {
        \@@_map_class_end:nw
          { \@@_map_loop:Nnw #1 {#2#3} }
      }
      {
        \@@_codepoint_compare:nNnTF {#3} > { "#5 }
          { \@@_map_Prepend_loop:Nnnw #1 {#2} {#3} }
          {
            \@@_map_class_end:nw
              { \@@_map_loop:Nnw #1 {#2} #3 }
          }
      }
  }
%    \end{macrocode}
%   Dealing with end-of-class is done such that we can be flexible: each
%   \enquote{not} function simply names the class to try next. The order
%   is \texttt{Control}, \texttt{Extend}, \texttt{SpacingMark},
%   \texttt{Prepend}, \texttt{L}, \texttt{LV}, \texttt{V}, \texttt{LVT},
%   \texttt{T} and finally \texttt{Regional_Indicator}. A codepoint in
%   none of these classes starts a new store after the pending material
%   has been output.
%    \begin{macrocode}
\cs_new:Npn \@@_map_not_Control:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { Extend } }
\cs_new:Npn \@@_map_not_Extend:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { SpacingMark } }
\cs_new:Npn \@@_map_not_SpacingMark:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { Prepend } }
\cs_new:Npn \@@_map_not_Prepend:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { L } }
\cs_new:Npn \@@_map_not_L:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { LV } }
\cs_new:Npn \@@_map_not_LV:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { V } }
\cs_new:Npn \@@_map_not_V:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { LVT } }
\cs_new:Npn \@@_map_not_LVT:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { T } }
\cs_new:Npn \@@_map_not_T:Nnn #1#2#3
  { \@@_map_class:Nnnn #1 {#2} {#3} { Regional_Indicator } }
\cs_new:Npn \@@_map_not_Regional_Indicator:Nnn #1#2#3
  {
    \@@_map_output:Nn #1 {#2}
    \@@_map_loop:Nnw #1 {#3}
  }
%    \end{macrocode}
%   Hangul needs additional treatment.
%   First we have to deal with
%   the start-of-Hangul position: output what we had up to now, then
%   move to the specialist handler. The idea here is to pick off the
%   different codepoint types one at a time, tracking what else can be
%   considered at each stage until we hit the end of the viable types.
%   Other than that, we just keep building up the Hangul codepoints
%   using a dedicated version of the loop from above. The last argument
%   passed to \cs{@@_map_hangul:Nnnw} is a \texttt{;}-separated list of
%   the class names which may continue the current sequence.
%    \begin{macrocode}
\cs_new:Npn \@@_map_L:Nnn #1#2#3
  {
    \@@_map_output:Nn #1 {#2}
    \@@_map_hangul:Nnnw #1 {#3} { L ; V ; LV ; LVT }
  }
\cs_new:Npn \@@_map_LV:Nnn #1#2#3
  {
    \@@_map_output:Nn #1 {#2}
    \@@_map_hangul:Nnnw #1 {#3} { V ; T }
  }
\cs_new_eq:NN \@@_map_V:Nnn \@@_map_LV:Nnn
\cs_new:Npn \@@_map_LVT:Nnn #1#2#3
  {
    \@@_map_output:Nn #1 {#2}
    \@@_map_hangul:Nnnw #1 {#3} { T }
  }
\cs_new_eq:NN \@@_map_T:Nnn \@@_map_LVT:Nnn
%    \end{macrocode}
%   Look at what follows the Hangul collected so far (|#2|). Anything
%   which is not \texttt{N}-type ends the sequence, as does the end of
%   the input or a control sequence. In the latter case the control
%   sequence has been grabbed as |#4| and so has to be put back into the
%   input for the main loop to deal with.
%    \begin{macrocode}
\cs_new:Npn \@@_map_hangul:Nnnw #1#2#3#4 \q_@@_recursion_stop
  {
    \tl_if_head_is_N_type:nTF {#4}
      { \@@_map_hangul:NnnN #1 {#2} {#3} }
      {
        #1 {#2}
        \@@_map_loop:Nnw #1 { }
      }
    #4 \q_@@_recursion_stop
  }
\cs_new:Npn \@@_map_hangul:NnnN #1#2#3#4
  {
    \@@_if_q_recursion_tail_stop_do:Nn #4
      {
        #1 {#2}
        \text_map_break:
      }
    \token_if_cs:NTF #4
      {
        #1 {#2}
        \@@_map_loop:Nnw #1 { } #4
      }
      { \@@_codepoint_process:nN { \@@_map_hangul:Nnnn #1 {#2} {#3} } #4 }
  }
%    \end{macrocode}
%   Work through the list of permitted classes one at a time. If the
%   list is exhausted without a match, the new codepoint (|#3| of the
%   auxiliary) goes back to the main loop with the Hangul still in the
%   store.
%    \begin{macrocode}
\cs_new:Npn \@@_map_hangul:Nnnn #1#2#3#4
  {
    \@@_map_hangul_aux:Nnnw #1 {#2} {#4}
      #3 ; \q_recursion_tail ; \q_recursion_stop
  }
\cs_new:Npn \@@_map_hangul_aux:Nnnw #1#2#3#4 ;
  {
    \quark_if_recursion_tail_stop_do:nn {#4}
      { \@@_map_loop:Nnw #1 {#2} #3 }
    \exp_args:Nv \@@_map_hangul:nNnnnw { c_@@_grapheme_ #4 _clist }
      #1 {#2} {#3} {#4}
  }
\cs_new:Npn \@@_map_hangul:nNnnnw #1#2#3#4#5#6 \q_recursion_stop
  {
    \@@_map_hangul_loop:Nnnnnw #2 {#3} {#4} {#5} {#6}
      #1 , \q_@@_recursion_tail .. , \q_@@_recursion_stop
  }
%    \end{macrocode}
%   The range loop: |#4| is the class being tested and |#5| the classes
%   still to try. A miss moves on to the next class, a hit calls
%   \texttt{\cs{@@_map_hangul_\meta{class}:Nnn}}.
%    \begin{macrocode}
\cs_new:Npn \@@_map_hangul_loop:Nnnnnw #1#2#3#4#5 #6 .. #7 ,
  {
    \@@_if_q_recursion_tail_stop_do:nn {#6}
      { \@@_map_hangul_next:Nnnn #1 {#2} {#3} {#5} }
    \@@_codepoint_compare:nNnTF {#3} < { "#6 }
      {
        \@@_map_hangul_end:nw
          { \@@_map_hangul_next:Nnnn #1 {#2} {#3} {#5} }
      }
      {
        \@@_codepoint_compare:nNnTF {#3} > { "#7 }
          { \@@_map_hangul_loop:Nnnnnw #1 {#2} {#3} {#4} {#5} }
          {
            \@@_map_hangul_end:nw
              { \use:c { @@_map_hangul_ #4 :Nnn } #1 {#2} {#3} }
          }
      }
  }
\cs_new:Npn \@@_map_hangul_next:Nnnn #1#2#3#4
  { \@@_map_hangul_aux:Nnnw #1 {#2} {#3} #4 \q_recursion_stop }
\cs_new:Npn \@@_map_hangul_end:nw #1#2 \q_@@_recursion_stop {#1}
%    \end{macrocode}
%   After a hit, the codepoint is added to the store and the search
%   continues with the classes which may follow it. The lists use the
%   same \texttt{;}-separated form as above, as that is what
%   \cs{@@_map_hangul_aux:Nnnw} parses.
%    \begin{macrocode}
\cs_new:Npn \@@_map_hangul_L:Nnn #1#2#3
  { \@@_map_hangul:Nnnw #1 {#2#3} { L ; V ; LV ; LVT } }
\cs_new:Npn \@@_map_hangul_LV:Nnn #1#2#3
  { \@@_map_hangul:Nnnw #1 {#2#3} { V ; T } }
\cs_new_eq:NN \@@_map_hangul_V:Nnn \@@_map_hangul_LV:Nnn
\cs_new:Npn \@@_map_hangul_LVT:Nnn #1#2#3
  { \@@_map_hangul:Nnnw #1 {#2#3} { T } }
\cs_new_eq:NN \@@_map_hangul_T:Nnn \@@_map_hangul_LVT:Nnn
%    \end{macrocode}
%   The Regional Indicator rule means looking ahead and dealing with the
%   case where there are two in a row. So we use a look ahead to pick them
%   off. As there is only one range the values are hard-coded.
%    \begin{macrocode}
\cs_new:Npn \@@_map_Regional_Indicator:Nnn #1#2#3
  {
    \@@_map_output:Nn #1 {#2}
    \@@_map_lookahead:NnNw #1 {#3} \@@_map_Regional_Indicator_aux:Nnn
  }
\cs_new:Npn \@@_map_Regional_Indicator_aux:Nnn #1#2#3
  {
    \bool_lazy_or:nnTF
      { \@@_codepoint_compare_p:nNn {#3} < { "1F1E6 } }
      { \@@_codepoint_compare_p:nNn {#3} > { "1F1FF } }
      { \@@_map_loop:Nnw #1 {#2} #3 }
      { \@@_map_loop:Nnw #1 {#2#3} }
  }
%    \end{macrocode}
%   A generic look-ahead setup.
% \begin{macrocode} \cs_new:Npn \@@_map_lookahead:NnNw #1#2#3#4 \q_@@_recursion_stop { \tl_if_head_is_N_type:nTF {#4} { \@@_map_lookahead:NnNN #1 {#2} #3 } { \@@_map_loop:Nnw #1 {#2} } #4 \q_@@_recursion_stop } \cs_new:Npn \@@_map_lookahead:NnNN #1#2#3#4 { \@@_if_q_recursion_tail_stop_do:Nn #4 { #1 {#2} } \token_if_cs:NTF #4 { #1 {#2} \@@_map_loop:Nnw #1 { } } { \@@_codepoint_process:nN { #3 #1 {#2} } } #4 } % \end{macrocode} % For the end of the process. % \begin{macrocode} \cs_new:Npn \@@_map_output:Nn #1#2 { \tl_if_blank:nF {#2} { #1 {#2} } } \cs_new:Npn \text_map_break: { \prg_map_break:Nn \text_map_break: { } } \cs_new:Npn \text_map_break:n { \prg_map_break:Nn \text_map_break: } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % % \begin{macro}{\text_map_inline:nn} % The standard non-expandable inline version. % \begin{macrocode} \cs_new_protected:Npn \text_map_inline:nn #1#2 { \int_gincr:N \g__kernel_prg_map_int \cs_gset_protected:cpn { @@_map_ \int_use:N \g__kernel_prg_map_int :w } ##1 {#2} \exp_args:Nnc \text_map_function:nN {#1} { @@_map_ \int_use:N \g__kernel_prg_map_int :w } \prg_break_point:Nn \text_map_break: { \int_gdecr:N \g__kernel_prg_map_int } } % \end{macrocode} % \end{macro} % % \begin{macrocode} % % \end{macrocode} % % \end{implementation} % % \PrintIndex