% \iffalse %<*gobble> % $Id: seqsplit.dtx,v 1.3 2006/08/08 00:02:08 boris Exp $ % % Copyright 2006, Boris Veytsman % This work may be distributed and/or modified under the % conditions of the LaTeX Project Public License, either % version 1.3 of this license or (at your option) any % later version. % The latest version of the license is in % http://www.latex-project.org/lppl.txt % and version 1.3 or later is part of all distributions of % LaTeX version 2003/06/01 or later. % % This work has the LPPL maintenance status `maintained'. % % The Current Maintainer of this work is Boris Veytsman % % This work consists of the file seqsplit.dtx and the % derived files seqsplit.sty, seqsplit.dtx. % % \fi % \CheckSum{50} % % \changes{v0.1}{2006/08/07}{The first released version} % %% \CharacterTable %% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z %% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z %% Digits \0\1\2\3\4\5\6\7\8\9 %% Exclamation \! Double quote \" Hash (number) \# %% Dollar \$ Percent \% Ampersand \& %% Acute accent \' Left paren \( Right paren \) %% Asterisk \* Plus \+ Comma \, %% Minus \- Point \. Solidus \/ %% Colon \: Semicolon \; Less than \< %% Equals \= Greater than \> Question mark \? 
%% Commercial at \@ Left bracket \[ Backslash \\ %% Right bracket \] Circumflex \^ Underscore \_ %% Grave accent \` Left brace \{ Vertical bar \| %% Right brace \} Tilde \~} % %\iffalse % \begin{macrocode} \documentclass{ltxdoc} \usepackage{array} \usepackage{url} \usepackage{seqsplit} \DoNotIndex{\NeedsTeXFormat, \ProvidesPackage, \def, \hspace} \DoNotIndex{\futurelet, \@gobble, \ifx, \else, \fi, \relax} \DoNotIndex{\ifmmode, \fi, \allowbreak} \PageIndex \CodelineIndex \RecordChanges \EnableCrossrefs \begin{document} \DocInput{seqsplit.dtx} \end{document} % \end{macrocode} % % \fi % \MakeShortVerb{|} % %\GetFileInfo{seqsplit.sty} % \title{Splitting Long Sequences of Letters (DNA, RNA, Proteins, % Etc.)\thanks{\copyright Boris Veytsman, 2006}} % \author{Boris Veytsman} % \date{\filedate, \fileversion} % \maketitle % % \begin{abstract} % Sometimes one needs to typeset long sequences of letters, which % should not have spaces between them (like letters in words), but % could be split between lines at any point, and without a % hyphenation character. This package provides a command for such % sequences. % \end{abstract} % % \tableofcontents % % \clearpage % %\section{Introduction} %\label{sec:intro} % % At a recent Practical\TeX{} conference (Practical\TeX-2006, Rutgers, % New Jersey, USA, \url{http://www.tug.org/practicaltex2006}) Klaus % H\"oppner asked how one typesets long sequences like the ones % related to DNA code. Usually there is no space between letters, but % a sequence could be split at any point and continued on the next % line. The audience suggested several solutions to this problem. % One solution, for example, was to define a new language, where % hyphenation is possible at any point, and the hyphenation character is % empty. However, this would require regeneration of all \TeX{} % formats, which might not be practical or even possible. 
Another % solution, suggested, if my memory is right, by Peter Flynn, was to % scan the sequence and insert a breaking point after each letter. % This latter approach is implemented in this package. % % % %\section{User Interface} %\label{sec:interface} % % %\subsection{Main Command} %\label{sec:command} % % \DescribeMacro{\seqsplit} % The main (and actually the only) command in this package is % |\seqsplit|. Its usage is very simple. For example, to typeset the % gene HBB, related to sickle cell anaemia (actually, the % corresponding mRNA Reference Sequence), we use the following: % \begin{verbatim} % \seqsplit{% % acatttgcttctgacacaactgtgttcactagcaacctcaaacagacaccatggtgcatc% % tgactcctgaggagaagtctgccgttactgccctgtggggcaaggtgaacgtggatgaag% % ttggtggtgaggccctgggcaggctgctggtggtctacccttggacccagaggttctttg% % agtcctttggggatctgtccactcctgatgctgttatgggcaaccctaaggtgaaggctc% % atggcaagaaagtgctcggtgcctttagtgatggcctggctcacctggacaacctcaagg% % gcacctttgccacactgagtgagctgcactgtgacaagctgcacgtggatcctgagaact% % tcaggctcctgggcaacgtgctggtctgtgtgctggcccatcactttggcaaagaattca% % ccccaccagtgcaggctgcctatcagaaagtggtggctggtgtggctaatgccctggccc% % acaagtatcactaagctcgctttcttgctgtccaatttctattaaaggttcctttgttcc% % ctaagtccaactactaaactgggggatattatgaagggccttgagcatctggattctgcc% % taataaaaaacatttattttcattgc}. 
% \end{verbatim} % which produces % \begin{quote} % \seqsplit{% % acatttgcttctgacacaactgtgttcactagcaacctcaaacagacaccatggtgcatc% % tgactcctgaggagaagtctgccgttactgccctgtggggcaaggtgaacgtggatgaag% % ttggtggtgaggccctgggcaggctgctggtggtctacccttggacccagaggttctttg% % agtcctttggggatctgtccactcctgatgctgttatgggcaaccctaaggtgaaggctc% % atggcaagaaagtgctcggtgcctttagtgatggcctggctcacctggacaacctcaagg% % gcacctttgccacactgagtgagctgcactgtgacaagctgcacgtggatcctgagaact% % tcaggctcctgggcaacgtgctggtctgtgtgctggcccatcactttggcaaagaattca% % ccccaccagtgcaggctgcctatcagaaagtggtggctggtgtggctaatgccctggccc% % acaagtatcactaagctcgctttcttgctgtccaatttctattaaaggttcctttgttcc% % ctaagtccaactactaaactgggggatattatgaagggccttgagcatctggattctgcc% % taataaaaaacatttattttcattgc}. % \end{quote} % Note that the breaking points in the code (commented out by \%) have % nothing to do with the breaking points in the typeset sequence and % are introduced only for readability of the code. % % The corresponding protein sequence ($\beta$-globin) is shorter: % \begin{verbatim} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % vkahgkkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvlvcvlahhfg% % keftppvqaayqkvvagvanalahkyh}. % \end{verbatim} % \begin{quote} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % vkahgkkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvlvcvlahhfg% % keftppvqaayqkvvagvanalahkyh}. % \end{quote} % % The command works in math mode as well: % \begin{verbatim} % $\pi = \seqsplit{% % 3. 
% 1415926535 8979323846 2643383279 5028841971 6939937510 % 5820974944 5923078164 0628620899 8628034825 3421170679 % 8214808651 3282306647 0938446095 5058223172 5359408128 % 4811174502 8410270193 8521105559 6446229489 5493038196 % 4428810975 6659334461 2847564823 3786783165 2712019091 % 4564856692 3460348610 4543266482 1339360726 0249141273 % 7245870066 0631558817 4881520920 9628292540 9171536436 % 7892590360 0113305305 4882046652 1384146951 9415116094 % 3305727036 5759591953 0921861173 8193261179 3105118548 % 0744623799 6274956735 1885752724 8912279381 8301194912 % 9833673362 4406566430 8602139494 6395224737 1907021798 % 6094370277 0539217176 2931767523 8467481846 7669405132 % 0005681271 4526356082 7785771342 7577896091 7363717872 % 1468440901 2249534301 4654958537 1050792279 6892589235} % \ldots$ % \end{verbatim} % \begin{quote} % $\pi = \seqsplit{% % 3. % 1415926535 8979323846 2643383279 5028841971 6939937510 % 5820974944 5923078164 0628620899 8628034825 3421170679 % 8214808651 3282306647 0938446095 5058223172 5359408128 % 4811174502 8410270193 8521105559 6446229489 5493038196 % 4428810975 6659334461 2847564823 3786783165 2712019091 % 4564856692 3460348610 4543266482 1339360726 0249141273 % 7245870066 0631558817 4881520920 9628292540 9171536436 % 7892590360 0113305305 4882046652 1384146951 9415116094 % 3305727036 5759591953 0921861173 8193261179 3105118548 % 0744623799 6274956735 1885752724 8912279381 8301194912 % 9833673362 4406566430 8602139494 6395224737 1907021798 % 6094370277 0539217176 2931767523 8467481846 7669405132 % 0005681271 4526356082 7785771342 7577896091 7363717872 % 1468440901 2249534301 4654958537 1050792279 6892589235} % \ldots$ % \end{quote} % %\subsection{Customization} %\label{sec:customization} % % \DescribeMacro{\seqinsert} The command |\seqsplit| can be customized % by redefining the command |\seqinsert|, which is the macro that is % inserted between the letters of the sequence. 
By default it is % defined as |\allowbreak| in math mode and |\hspace{0pt plus 0.02em}| % in text mode: a slightly stretchable glue of zero length. This % definition gives \TeX{} a chance to justify the lines. However, % there might be other definitions. For example, if we want hyphens % at the breakpoints in text mode, we can use: % \begin{quote} % |\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi}| % \end{quote} % which produces for the $\beta$-globin protein from the previous % section the following: % \begin{quote} % \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % vkahgkkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvlvcvlahhfg% % keftppvqaayqkvvagvanalahkyh}. % \end{quote} % Another redefinition, % \begin{quote} % |\renewcommand{\seqinsert}{\ifmmode\allowbreak\else{} \fi}|, % \end{quote} % produces an output with spaces between letters. Note that there is % no space between the last letter and the dot: the package takes care % of this: % \begin{quote} % \renewcommand{\seqinsert}{\ifmmode\allowbreak\else{} \fi} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % vkahgkkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvlvcvlahhfg% % keftppvqaayqkvvagvanalahkyh}. % \end{quote} % % % %\subsection{Grouping and Commands} %\label{sec:grouping} % % The command |\seqsplit| does not insert breakpoints between the % letters inside braces |{...}|. Compare the typesetting of % $\beta$-globin in Section~\ref{sec:command} and the following % example: % \begin{verbatim} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % v{kahg}kkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvlvcvlahhfg% % keftppvqaayqkvvagvanalahkyh}. % \end{verbatim} % \begin{quote} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % v{kahg}kkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvlvcvlahhfg% % keftppvqaayqkvvagvanalahkyh}. 
% \end{quote} % The braces around |{kahg}| prevented a splitting of this group. % This effect can be used for typesetting special substrings inside % sequences. % % The way |\seqsplit| works interferes with formatting commands like % |\textit|. Therefore the sequence |{kahg}| is \emph{not} italicized % in the following example: % \begin{verbatim} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % v\textit{kahg}kkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvl% % vcvlahhfgkeftppvqaayqkvvagvanalahkyh}. % \end{verbatim} % \begin{quote} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % v\textit{kahg}kkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvl% % vcvlahhfgkeftppvqaayqkvvagvanalahkyh}. % \end{quote} % % Using grouping |{\textit{kahg}}| we can save the situation: % \begin{verbatim} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % v{\textit{kahg}}kkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvl% % vcvlahhfgkeftppvqaayqkvvagvanalahkyh}. % \end{verbatim} % \begin{quote} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % v{\textit{kahg}}kkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvl% % vcvlahhfgkeftppvqaayqkvvagvanalahkyh}. % \end{quote} % % If we want the italicized sequence to be splittable as well, we can % use \emph{nested} |\seqsplit|: % \begin{verbatim} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % v{\textit{\seqsplit{kahg}}}kkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvl% % vcvlahhfgkeftppvqaayqkvvagvanalahkyh}. % \end{verbatim} % \begin{quote} % \seqsplit{% % mvhltpeeksavtalwgkvnvdevggealgrllvvypwtqrffesfgdlstpdavmgnpk% % v{\textit{\seqsplit{kahg}}}kkvlgafsdglahldnlkgtfatlselhcdklhvdpenfrllgnvl% % vcvlahhfgkeftppvqaayqkvvagvanalahkyh}. % \end{quote} % % These tricks allow one to produce splittable sequences with a rather % complex formatting. 
%
%
%\StopEventually{}
%
% \clearpage
%
% \section{Implementation}
% \label{sec:implementation}
%
% ^^A The space-skipping helpers of the scanner bring the number of
% ^^A backslashes in the code below to 81, so the checksum is restated
% ^^A here to keep \Finale consistent with the code of this section.
% \CheckSum{81}
% \DoNotIndex{\expandafter, \afterassignment, \let, \@sptoken}
%
%
%\subsection{Declarations}
%\label{sec:decl}
%
% We start with the declaration of who we are:
%
%
%    \begin{macrocode}
%<*style>
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{seqsplit}
[2006/08/07 v0.1 Splitting long sequences (DNA, RNA, proteins, etc.) ]
%    \end{macrocode}
%
%
%
%
%\subsection{Inserted Text}
%\label{sec:insertion}
%
%
% \begin{macro}{\seqinsert}
%   This is the macro we insert between letters: a penalty-free
%   breakpoint in math mode and a slightly stretchable glue of zero
%   natural width in text mode.  Users may redefine it.
%    \begin{macrocode}
\def\seqinsert{\ifmmode\allowbreak\else\hspace{0pt plus 0.02em}\fi}
%    \end{macrocode}
% \end{macro}
%
%
%
%\subsection{Scanner}
%\label{sec:scanner}
%
% The scanner code is not too trivial.  Here we describe it in detail.
%
% \begin{macro}{\seqsplit}
%   The main (actually, the only) user-space macro just starts the
%   scanner on its argument, followed by the end marker.
%    \begin{macrocode}
\def\seqsplit#1{\SQSPL@scan#1\SQSPL@end}
%    \end{macrocode}
% \end{macro}
%
% The macro |\SQSPL@end| is never expanded, it is just a marker.
%
% \begin{macro}{\SQSPL@skipspace}
%   Spaces inside the sequence are never typeset: they are dropped
%   when the scanner grabs its undelimited argument.  However, a space
%   token in front of the end marker (for example, when the closing
%   brace of |\seqsplit| is put on a line of its own) would hide the
%   marker from |\futurelet|: the lookahead would see the space, while
%   the argument grabbed afterwards would be |\SQSPL@end| itself, which
%   would then be executed as an undefined command.  Therefore both
%   lookahead macros discard space tokens before making a decision.
%   The macro |\SQSPL@skipspace| removes one token from the input by
%   assigning it to |\SQSPL@next|, and then calls its argument.  The
%   space after the equals sign is essential: it is the optional space
%   allowed by |\let|, so the token actually assigned is the next one
%   in the input.
%    \begin{macrocode}
\def\SQSPL@skipspace#1{\afterassignment#1\let\SQSPL@next= }
%    \end{macrocode}
% \end{macro}
% \begin{macro}{\SQSPL@scan}
%   The macro |\SQSPL@scan| saves the next token in the special
%   register |\SQSPL@next|, so we can decide what to do with it:
%    \begin{macrocode}
\def\SQSPL@scan{\futurelet\SQSPL@next\SQSPL@scanspace}
%    \end{macrocode}
% \end{macro}
% \begin{macro}{\SQSPL@scanspace}
%   If the next token is a space, we drop it and look again;
%   otherwise we proceed to the real decision.
%    \begin{macrocode}
\def\SQSPL@scanspace{%
  \ifx \SQSPL@next \@sptoken
    \expandafter\SQSPL@skipspace\expandafter\SQSPL@scan
  \else
    \expandafter\SQSPL@scani
  \fi}
%    \end{macrocode}
% \end{macro}
% \begin{macro}{\SQSPL@scani}
%   Now since we know the next token, we can decide to either stop the
%   expansion if we met the end, or continue it if we did not.  In the
%   former case the argument is the end marker, and we just gobble it.
%    \begin{macrocode}
\def\SQSPL@scani#1{%
  \ifx \SQSPL@end \SQSPL@next
    \def\SQSPL@process{\@gobble}%
  \else
    \def\SQSPL@process{\SQSPL@doprocess}\fi%
  \SQSPL@process{#1}}
%    \end{macrocode}
% \end{macro}
% \begin{macro}{\SQSPL@doprocess}
%   The processing of a letter depends on what is the next letter.  If
%   the sequence is finished, we should not insert anything after the
%   last letter: we do not want to break the line between the sequence
%   and, say, a comma.  Therefore we insert a special smart macro:
%    \begin{macrocode}
\def\SQSPL@doprocess#1{#1\SQSPL@insert}
%    \end{macrocode}
% \end{macro}
% \begin{macro}{\SQSPL@insert}
%   The macro |\SQSPL@insert| uses |\futurelet| to check whether the
%   processed letter is the last one in the sequence:
%    \begin{macrocode}
\def\SQSPL@insert{\futurelet\SQSPL@next\SQSPL@insertspace}
%    \end{macrocode}
% \end{macro}
% \begin{macro}{\SQSPL@insertspace}
%   Again, a space is dropped before the decision is made, so a space
%   between the last letter and the end marker does not cause
%   |\seqinsert| to be added after the last letter.
%    \begin{macrocode}
\def\SQSPL@insertspace{%
  \ifx \SQSPL@next \@sptoken
    \expandafter\SQSPL@skipspace\expandafter\SQSPL@insert
  \else
    \expandafter\SQSPL@doinsert
  \fi}
%    \end{macrocode}
% \end{macro}
% \begin{macro}{\SQSPL@doinsert}
%   And this is the macro that inserts |\seqinsert| (unless the end
%   marker follows) and continues scanning:
%    \begin{macrocode}
\def\SQSPL@doinsert{%
  \ifx \SQSPL@end \SQSPL@next \relax%
  \else \seqinsert \fi%
  \SQSPL@scan}
%    \end{macrocode}
% \end{macro}
%
%
%\subsection{The Last Words}
%\label{sec:last}
%
%
%
%    \begin{macrocode}
%</style>
%    \end{macrocode}
%\Finale
%\clearpage
%
%\PrintChanges
%\clearpage
%\PrintIndex
%
\endinput