Code Coverage
 
Classes and Traits
Functions and Methods
Lines
Total
0.00% covered (danger)
0.00%
0 / 1
48.28% covered (danger)
48.28%
14 / 29
CRAP
90.13% covered (success)
90.13%
566 / 628
Tokenizer
0.00% covered (danger)
0.00%
0 / 1
48.28% covered (danger)
48.28%
14 / 29
358.52
90.13% covered (success)
90.13%
566 / 628
 __construct
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
11 / 11
 setEnableCdataCallback
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
2 / 2
 execute
100.00% covered (success)
100.00%
1 / 1
3
100.00% covered (success)
100.00%
13 / 13
 getPreprocessedText
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
2 / 2
 switchState
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
3 / 3
 setFragmentContext
0.00% covered (danger)
0.00%
0 / 1
15.24
81.48% covered (warning)
81.48%
22 / 27
 beginStepping
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
4 / 4
 step
0.00% covered (danger)
0.00%
0 / 1
2.15
66.67% covered (warning)
66.67%
2 / 3
 preprocess
0.00% covered (danger)
0.00%
0 / 1
9.08
90.00% covered (success)
90.00%
18 / 20
 executeInternal
0.00% covered (danger)
0.00%
0 / 1
9
96.30% covered (success)
96.30%
26 / 27
 dataState
0.00% covered (danger)
0.00%
0 / 1
34
95.24% covered (success)
95.24%
100 / 105
 interpretCommentMatches
100.00% covered (success)
100.00%
1 / 1
17
100.00% covered (success)
100.00%
33 / 33
 interpretDoctypeMatches
100.00% covered (success)
100.00%
1 / 1
36
100.00% covered (success)
100.00%
53 / 53
 interpretDoctypeQuoted
100.00% covered (success)
100.00%
1 / 1
7
100.00% covered (success)
100.00%
15 / 15
 handleNulls
0.00% covered (danger)
0.00%
0 / 1
6.02
92.31% covered (success)
92.31%
12 / 13
 handleAsciiErrors
100.00% covered (success)
100.00%
1 / 1
5
100.00% covered (success)
100.00%
14 / 14
 handleCharRefs
0.00% covered (danger)
0.00%
0 / 1
46.32
94.67% covered (success)
94.67%
71 / 75
 emitDataRange
0.00% covered (danger)
0.00%
0 / 1
8.02
92.86% covered (success)
92.86%
13 / 14
 emitCdataRange
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
3 / 3
 emitRawTextRange
0.00% covered (danger)
0.00%
0 / 1
6.03
90.91% covered (success)
90.91%
10 / 11
 textElementState
0.00% covered (danger)
0.00%
0 / 1
5
95.00% covered (success)
95.00%
19 / 20
 consumeAttribs
0.00% covered (danger)
0.00%
0 / 1
4.04
86.67% covered (warning)
86.67%
13 / 15
 interpretAttribMatches
100.00% covered (success)
100.00%
1 / 1
31
100.00% covered (success)
100.00%
56 / 56
 handleAttribsAndClose
0.00% covered (danger)
0.00%
0 / 1
16.94
71.43% covered (warning)
71.43%
25 / 35
 plaintextState
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
2 / 2
 scriptDataState
0.00% covered (danger)
0.00%
0 / 1
7.26
82.61% covered (warning)
82.61%
19 / 23
 error
100.00% covered (success)
100.00%
1 / 1
3
100.00% covered (success)
100.00%
5 / 5
 fatal
0.00% covered (danger)
0.00%
0 / 1
2
0.00% covered (danger)
0.00%
0 / 1
 throwPregError
0.00% covered (danger)
0.00%
0 / 1
90
0.00% covered (danger)
0.00%
0 / 23
<?php
namespace RemexHtml\Tokenizer;
use RemexHtml\HTMLData;
use RemexHtml\PropGuard;
/**
 * HTML 5 tokenizer
 *
 * Based on the W3C recommendation as published 01 November 2016:
 * https://www.w3.org/TR/2016/REC-html51-20161101/
 */
class Tokenizer {
    use PropGuard;
    // Tokenizer state identifiers. DATA, RCDATA, RAWTEXT, SCRIPT_DATA and
    // PLAINTEXT share their names with tokenizer states in the HTML spec.
    // NOTE(review): START, EOF and CURRENT look like implementation-specific
    // pseudo-states; their consumers are outside this view -- confirm there.
    public const STATE_START = 1;
    public const STATE_DATA = 2;
    public const STATE_RCDATA = 3;
    public const STATE_RAWTEXT = 4;
    public const STATE_SCRIPT_DATA = 5;
    public const STATE_PLAINTEXT = 6;
    public const STATE_EOF = 7;
    public const STATE_CURRENT = 8;
    // Match indices for the data state regex. The values run consecutively
    // from 1; presumably each is the number of a capture group in a regex
    // defined elsewhere in this class, so the numbering has to stay in step
    // with that pattern -- TODO confirm against the regex.
    private const MD_END_TAG_OPEN = 1;
    private const MD_TAG_NAME = 2;
    private const MD_TAG_AFTER_LOWERCASE = 3;
    private const MD_COMMENT = 4;
    private const MD_COMMENT_INNER = 5;
    private const MD_COMMENT_END = 6;
    private const MD_DOCTYPE = 7;
    private const MD_DT_NAME_WS = 8;
    private const MD_DT_NAME = 9;
    private const MD_DT_PUBLIC_WS = 10;
    private const MD_DT_PUBLIC_DQ = 11;
    private const MD_DT_PUBLIC_SQ = 12;
    private const MD_DT_PUBSYS_WS = 13;
    private const MD_DT_PUBSYS_DQ = 14;
    private const MD_DT_PUBSYS_SQ = 15;
    private const MD_DT_SYSTEM_WS = 16;
    private const MD_DT_SYSTEM_DQ = 17;
    private const MD_DT_SYSTEM_SQ = 18;
    private const MD_DT_BOGUS = 19;
    private const MD_DT_END = 20;
    private const MD_CDATA = 21;
    private const MD_BOGUS_COMMENT = 22;
    // Match indices for the character reference regex (same numbering
    // caveat as above: the regex itself is not visible from here).
    private const MC_PREFIX = 1;
    private const MC_DECIMAL = 2;
    private const MC_HEXDEC = 3;
    private const MC_SEMICOLON = 4;
    private const MC_HASH = 5;
    private const MC_NAMED = 6;
    private const MC_SUFFIX = 7;
    private const MC_INVALID = 8;
    // Match indices for the attribute regex (same numbering caveat as
    // above). NOTE(review): the DQUOTED/SQUOTED/UNQUOTED names suggest the
    // three attribute value quoting forms -- confirm against the regex.
    private const MA_SLASH = 1;
    private const MA_NAME = 2;
    private const MA_SIMPLE_NAME = 3;
    private const MA_DQUOTED = 4;
    private const MA_DQUOTED_CHARREF = 5;
    private const MA_DQUOTED_UNSIMPLE = 6;
    private const MA_SQUOTED = 7;
    private const MA_SQUOTED_CHARREF = 8;
    private const MA_SQUOTED_UNSIMPLE = 9;
    private const MA_UNQUOTED = 10;
    private const MA_UNQUOTED_UNSIMPLE = 11;
    // Characters, stored as raw UTF-8 byte sequences
    // U+FFFD REPLACEMENT CHARACTER
    protected const REPLACEMENT_CHAR = "\xef\xbf\xbd";
    // U+FEFF, the byte order mark
    protected const BYTE_ORDER_MARK = "\xef\xbb\xbf";
    // A list of "common well-behaved entities", used to optimize fast paths.
    // Keys are complete character references, including the leading "&" and
    // the trailing ";"; values are the text each one expands to ("&nbsp;"
    // expands to U+00A0 NO-BREAK SPACE).
    // NOTE(review): the fast paths that read this table are outside this
    // view -- confirm how lookups are performed before changing the keys.
    private static $commonEntities = [
        '&amp;' => '&',
        '&apos;' => "'",
        '&lt;' => '<',
        '&gt;' => '>',
        '&quot;' => '"',
        '&nbsp;' => "\u{00A0}",
    ];
    // NOTE(review): the five properties below presumably mirror the
    // constructor options of the same names (see the $options documentation
    // on __construct()); the assignments are outside this view -- confirm.
    protected $ignoreErrors;
    protected $ignoreCharRefs;
    protected $ignoreNulls;
    protected $skipPreprocess;
    protected $scriptingFlag;
    // NOTE(review): presumably the tag name that closes the current
    // RCDATA/RAWTEXT/script data run -- confirm where it is set.
    protected $appropriateEndTag;
    // Presumably the TokenHandler passed to the constructor, i.e. the
    // object which receives token events -- confirm.
    protected $listener;
    // Presumably one of the STATE_* constants -- confirm.
    protected $state;
    // NOTE(review): looks like a flag recording whether the preprocessing
    // stage has run on $text -- confirm.
    protected $preprocessed;
    // Presumably the text being tokenized, the current offset into it and
    // its length; whether offsets are in bytes is not visible here.
    protected $text;
    protected $pos;
    protected $length;
    // NOTE(review): name suggests it is managed by a
    // setEnableCdataCallback() method -- confirm.
    protected $enableCdataCallback;
    // NOTE(review): names suggest the namespace and element name of a
    // fragment parsing context -- confirm where they are assigned.
    protected $fragmentNamespace;
    protected $fragmentName;
    /**
     * Constructor
     *
     * @param TokenHandler $listener The object which receives token events
     * @param string $text The text to tokenize
     * @param array $options Associative array of options, including:
     *   - ignoreErrors: True to improve performance by ignoring errors. The
     *     token stream should still be the same, except that error() won't be
     *     called.
     *   - ignoreCharRefs: True to ignore character references. Character tokens
     *     will contain the unexpanded character references, and no errors
     *     related to invalid character references will be raised. Performance
     *     will be improved. This is not compliant behaviour.
     *   - ignoreNulls: True to ignore NULL bytes in the input stream, instead
     *     of raising errors and converting them to U+FFFD as is usually
     *     required by the spec.
     *   - skipPreprocess: True to skip the "preprocessing the input stream"
     *     stage, which normalizes line endings and raises errors on certain
     *     control characters. Advisable if the input stream is already
     *     appropriately normalized.
     *   - scriptingFlag: True if the scripting flag is enabled. Default true.
     *     Setting this to false causes the contents of <noscript> elements to be
     *     processed as normal content. The scriptingFlag option in the
     *     TreeBuilder should be set to the same value.
     */
    public function __construct( TokenHandler $listener, $text, $options = [] ) {