Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
94.44% covered (success)
94.44%
102 / 108
81.82% covered (warning)
81.82%
9 / 11
CRAP
0.00% covered (danger)
0.00%
0 / 1
RegexStream
94.44% covered (success)
94.44%
102 / 108
81.82% covered (warning)
81.82%
9 / 11
41.29
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 nextToken
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
3
 splitTopLevelAlternation
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
9
 getIterator
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 createRepetition
84.38% covered (warning)
84.38%
27 / 32
0.00% covered (danger)
0.00%
0 / 1
11.46
 createStaticCharacterMarker
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 createEscapedCharacter
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 createStartMarker
66.67% covered (warning)
66.67%
2 / 3
0.00% covered (danger)
0.00%
0 / 1
2.15
 createCaptureGroup
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
6
 createAnyMatch
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
4
 createEndMarker
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2namespace Apie\RegexTools;
3
4use Apie\RegexTools\Parts\AnyMatch;
5use Apie\RegexTools\Parts\CaptureGroup;
6use Apie\RegexTools\Parts\EndOfRegex;
7use Apie\RegexTools\Parts\EscapedCharacter;
8use Apie\RegexTools\Parts\MatchOrMatch;
9use Apie\RegexTools\Parts\OptionalToken;
10use Apie\RegexTools\Parts\RegexPartInterface;
11use Apie\RegexTools\Parts\RepeatToken;
12use Apie\RegexTools\Parts\RepetitionToken;
13use Apie\RegexTools\Parts\StartOfRegex;
14use Apie\RegexTools\Parts\StaticCharacter;
15use IteratorAggregate;
16use Traversable;
17
/**
 * Consumes a regular expression string token by token.
 *
 * Every call to nextToken() removes the leading token (including any quantifier that
 * follows it) from the remaining regex and returns it as a regex part. Iterating the
 * object hands the complete, unconsumed regex to a RegexPartIterator.
 */
final class RegexStream implements IteratorAggregate
{
    /**
     * Maps the first character of the remaining regex to the private factory method that
     * builds the matching part. Characters not listed here are handled by
     * createStaticCharacterMarker().
     */
    public const METHODMAP = [
        '^' => 'createStartMarker',
        '$' => 'createEndMarker',
        '\\' => 'createEscapedCharacter',
        '(' => 'createCaptureGroup',
        '[' => 'createAnyMatch',
    ];

    /** The part returned by the previous nextToken() call, or null when none was produced yet. */
    private ?RegexPartInterface $previousPart = null;

    /** The regex as passed to the constructor; never shortened, used by getIterator(). */
    private string $fullRegex;

    /**
     * @param string $regexToStream regex to tokenize; this property shrinks as tokens are consumed
     */
    public function __construct(
        private string $regexToStream
    ) {
        $this->fullRegex = $regexToStream;
    }

    /**
     * Returns the next part of the regex, or null when nothing is left to consume.
     *
     * If the remaining regex contains a top-level alternation, the whole remainder is
     * consumed at once and returned as a single MatchOrMatch of both sides.
     */
    public function nextToken(): ?RegexPartInterface
    {
        $alternation = $this->splitTopLevelAlternation($this->regexToStream);
        if ($alternation !== null) {
            [$left, $right] = $alternation;
            // Both sides are tokenized by their own streams, so nothing remains here.
            $this->regexToStream = '';

            return new MatchOrMatch(
                iterator_to_array(new self($left)),
                iterator_to_array(new self($right))
            );
        }

        $firstCharacter = substr($this->regexToStream, 0, 1);
        if ($firstCharacter === '') {
            return null;
        }

        $method = self::METHODMAP[$firstCharacter] ?? 'createStaticCharacterMarker';
        /** @var RegexPartInterface $part */
        $part = $this->$method();
        // The factory only inspects the stream; the part reports how many characters it covers.
        $this->regexToStream = substr($this->regexToStream, $part->getRegexStringLength());
        $part = $this->createRepetition($part);
        $this->previousPart = $part;

        return $part;
    }

    /**
     * Splits a regex at its first top-level "|".
     *
     * In a regular expression "|" binds weakest: 'abc|def' means 'abc' or 'def'. Without
     * this split it would be read as a, b, then c-or-d, then e, f.
     *
     * A "|" inside parentheses or inside a character class is not top-level. Everything
     * inside a character class is literal, so "(", "[" and "|" found there are ignored.
     * A backslash always skips the character that follows it.
     *
     * @return array{0: string, 1: string}|null the text before and after the "|", or null
     *                                          when there is no top-level alternation
     */
    private function splitTopLevelAlternation(string $regex): ?array
    {
        $groupDepth = 0;
        $insideCharacterClass = false;
        $length = strlen($regex);
        for ($i = 0; $i < $length; $i++) {
            $char = $regex[$i];
            if ($char === '\\') {
                $i++; // skip escaped character
                continue;
            }
            if ($insideCharacterClass) {
                // Only the closing bracket has a meaning inside a character class.
                if ($char === ']') {
                    $insideCharacterClass = false;
                }
                continue;
            }
            if ($char === '[') {
                $insideCharacterClass = true;
            } elseif ($char === '(') {
                $groupDepth++;
            } elseif ($char === ')') {
                // An unmatched ")" is ignored instead of driving the depth negative.
                if ($groupDepth > 0) {
                    $groupDepth--;
                }
            } elseif ($char === '|' && $groupDepth === 0) {
                return [
                    substr($regex, 0, $i),
                    substr($regex, $i + 1),
                ];
            }
        }

        return null;
    }

    /**
     * Returns a RegexPartIterator created from the full regex given to the constructor,
     * regardless of how much nextToken() has already consumed.
     */
    public function getIterator(): Traversable
    {
        return new RegexPartIterator($this->fullRegex);
    }

    /**
     * Wraps $part in every quantifier that directly follows it in the stream.
     *
     * Handles "*", "+", "?", "{n}" and "{n,m}" (either bound may be empty) and consumes the
     * quantifier text. Stacked quantifiers wrap each other from the inside out. A "{" that
     * does not form one of the accepted shapes is left in the stream so it is tokenized as
     * a static character afterwards.
     *
     * A leading "|" is a fallback for alternations that splitTopLevelAlternation() did not
     * detect: the rest of the stream becomes the right-hand side of a MatchOrMatch.
     */
    private function createRepetition(RegexPartInterface $part): RegexPartInterface
    {
        for (;;) {
            $quantifier = substr($this->regexToStream, 0, 1);

            if ($quantifier === '*' || $quantifier === '+' || $quantifier === '?') {
                $this->regexToStream = substr($this->regexToStream, 1);
                $part = match ($quantifier) {
                    '*' => new RepetitionToken($part),
                    // The second argument distinguishes the "+" form from the "*" form.
                    '+' => new RepetitionToken($part, true),
                    '?' => new OptionalToken($part),
                };
                continue;
            }

            if ($quantifier === '|') {
                $part = new MatchOrMatch(
                    [$part],
                    iterator_to_array(new self(substr($this->regexToStream, 1)))
                );
                $this->regexToStream = '';

                return $part;
            }

            if ($quantifier !== '{') {
                return $part;
            }

            if (preg_match('/^\{\s*(\d*)\s*,\s*(\d*)\s*\}/', $this->regexToStream, $matches)) {
                $this->regexToStream = substr($this->regexToStream, strlen($matches[0]));
                // An omitted bound is passed on as null.
                $minimum = $matches[1] === '' ? null : intval($matches[1]);
                $maximum = $matches[2] === '' ? null : intval($matches[2]);
                $part = new RepeatToken($part, $minimum, $maximum, $matches[0]);
                continue;
            }

            if (preg_match('/^\{\s*(\d*)\s*\}/', $this->regexToStream, $matches)) {
                $this->regexToStream = substr($this->regexToStream, strlen($matches[0]));
                $repeatCount = $matches[1] === '' ? null : intval($matches[1]);
                $part = new RepeatToken($part, $repeatCount, $repeatCount, $matches[0]);
                continue;
            }

            // "{" without "}" or in an unsupported format => treated as a static "{" later.
            return $part;
        }
    }

    /**
     * Builds a StaticCharacter from the first character of the remaining regex.
     */
    private function createStaticCharacterMarker(): RegexPartInterface
    {
        return new StaticCharacter(substr($this->regexToStream, 0, 1));
    }

    /**
     * Builds an EscapedCharacter from the character following the leading backslash.
     *
     * A backslash that is the very last character of the regex has nothing to escape and
     * is returned as a static backslash.
     */
    private function createEscapedCharacter(): RegexPartInterface
    {
        if (strlen($this->regexToStream) === 1) {
            return new StaticCharacter('\\');
        }

        return new EscapedCharacter(substr($this->regexToStream, 1, 1));
    }

    /**
     * Builds the part for a leading "^".
     *
     * Only a "^" seen before any other part was produced becomes a StartOfRegex; once a
     * previous part exists the "^" is returned as a static character.
     */
    private function createStartMarker(): RegexPartInterface
    {
        if ($this->previousPart !== null) {
            return $this->createStaticCharacterMarker();
        }

        return new StartOfRegex();
    }

    /**
     * Builds a CaptureGroup from the parenthesised expression at the start of the stream.
     *
     * Scans for the ")" that balances the leading "(", skipping escaped characters and
     * ignoring parentheses inside a character class. The text between the parentheses is
     * tokenized by a nested stream. When the group is never closed, everything after the
     * opening "(" is used as the group content.
     */
    private function createCaptureGroup(): RegexPartInterface
    {
        $regex = $this->regexToStream;
        $length = strlen($regex);
        $depth = 1;
        $insideCharacterClass = false;
        $closingPosition = null;

        for ($ptr = 1; $ptr < $length; $ptr++) {
            $character = $regex[$ptr];
            if ($character === '\\') {
                $ptr++; // skip escaped character
                continue;
            }
            if ($insideCharacterClass) {
                if ($character === ']') {
                    $insideCharacterClass = false;
                }
                continue;
            }
            if ($character === '[') {
                $insideCharacterClass = true;
            } elseif ($character === '(') {
                $depth++;
            } elseif ($character === ')') {
                $depth--;
                if ($depth === 0) {
                    $closingPosition = $ptr;
                    break;
                }
            }
        }

        $insideCaptureGroup = $closingPosition === null
            ? substr($regex, 1)
            : substr($regex, 1, $closingPosition - 1);

        return new CaptureGroup(
            iterator_to_array(new self($insideCaptureGroup))
        );
    }

    /**
     * Builds an AnyMatch from the character class at the start of the stream.
     *
     * The class content is the raw text between the leading "[" and the first unescaped
     * "]". When no closing bracket exists, everything after the "[" is used.
     */
    private function createAnyMatch(): RegexPartInterface
    {
        $regex = $this->regexToStream;
        $length = strlen($regex);
        $closingPosition = null;

        for ($ptr = 1; $ptr < $length; $ptr++) {
            $character = $regex[$ptr];
            if ($character === '\\') {
                $ptr++; // skip escaped character
                continue;
            }
            if ($character === ']') {
                $closingPosition = $ptr;
                break;
            }
        }

        $insideAnyMatch = $closingPosition === null
            ? substr($regex, 1)
            : substr($regex, 1, $closingPosition - 1);

        return new AnyMatch($insideAnyMatch);
    }

    /**
     * Builds the EndOfRegex part for a leading "$".
     */
    private function createEndMarker(): RegexPartInterface
    {
        return new EndOfRegex();
    }
}