Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
94.44% |
102 / 108 |
|
81.82% |
9 / 11 |
CRAP | |
0.00% |
0 / 1 |
RegexStream | |
94.44% |
102 / 108 |
|
81.82% |
9 / 11 |
41.29 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
nextToken | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
3 | |||
splitTopLevelAlternation | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
9 | |||
getIterator | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
createRepetition | |
84.38% |
27 / 32 |
|
0.00% |
0 / 1 |
11.46 | |||
createStaticCharacterMarker | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
createEscapedCharacter | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
createStartMarker | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
createCaptureGroup | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
6 | |||
createAnyMatch | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
4 | |||
createEndMarker | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | namespace Apie\RegexTools; |
3 | |
4 | use Apie\RegexTools\Parts\AnyMatch; |
5 | use Apie\RegexTools\Parts\CaptureGroup; |
6 | use Apie\RegexTools\Parts\EndOfRegex; |
7 | use Apie\RegexTools\Parts\EscapedCharacter; |
8 | use Apie\RegexTools\Parts\MatchOrMatch; |
9 | use Apie\RegexTools\Parts\OptionalToken; |
10 | use Apie\RegexTools\Parts\RegexPartInterface; |
11 | use Apie\RegexTools\Parts\RepeatToken; |
12 | use Apie\RegexTools\Parts\RepetitionToken; |
13 | use Apie\RegexTools\Parts\StartOfRegex; |
14 | use Apie\RegexTools\Parts\StaticCharacter; |
15 | use IteratorAggregate; |
16 | use Traversable; |
17 | |
/**
 * Tokenizes a regular expression string into RegexPartInterface objects.
 *
 * The stream keeps the not-yet-consumed remainder of the regex in $regexToStream
 * and hands out one part per nextToken() call. Iterating the object delegates to
 * RegexPartIterator, which is constructed with the full, unconsumed regex.
 */
final class RegexStream implements IteratorAggregate
{
    /**
     * Maps the first character of the remaining regex to the name of the private
     * factory method that creates the part for it. Characters that are not listed
     * are handled by createStaticCharacterMarker().
     */
    public const METHODMAP = [
        '^' => 'createStartMarker',
        '$' => 'createEndMarker',
        '\\' => 'createEscapedCharacter',
        '(' => 'createCaptureGroup',
        '[' => 'createAnyMatch',
    ];

    /** Last part returned by the regular (non-alternation) path of nextToken(). */
    private ?RegexPartInterface $previousPart = null;

    /** The complete regex as passed to the constructor; never consumed. */
    private string $fullRegex;

    /**
     * @param string $regexToStream regex to tokenize; this property shrinks while tokens are read
     */
    public function __construct(
        private string $regexToStream
    ) {
        $this->fullRegex = $regexToStream;
    }

    /**
     * Reads the next part from the stream.
     *
     * If the remaining regex contains a top-level alternation, the whole remainder
     * is consumed and returned as a single MatchOrMatch of both sides.
     *
     * @return RegexPartInterface|null the next part, or null when the stream is empty
     */
    public function nextToken(): ?RegexPartInterface
    {
        $split = $this->splitTopLevelAlternation($this->regexToStream);
        if ($split !== null) {
            [$left, $right] = $split;
            $this->regexToStream = ''; // both sides together cover the whole remainder
            return new MatchOrMatch(
                iterator_to_array(new self($left)),
                iterator_to_array(new self($right))
            );
        }
        $firstCharacter = substr($this->regexToStream, 0, 1);
        if ($firstCharacter === '') {
            return null;
        }
        $method = self::METHODMAP[$firstCharacter] ?? 'createStaticCharacterMarker';
        /** @var RegexPartInterface */
        $part = $this->$method();
        // The part reports how many characters of the regex it represents.
        $this->regexToStream = substr($this->regexToStream, $part->getRegexStringLength());
        // Quantifiers directly after the part wrap it.
        $part = $this->createRepetition($part);
        $this->previousPart = $part;

        return $part;
    }

    /**
     * Splits a regex at its first top-level '|'.
     *
     * In a regular expression '|' binds weakest: 'abc|def' means 'abc' or 'def',
     * not 'ab', then 'c' or 'd', then 'ef'. A '|' inside a group or a character
     * class, or an escaped one, is not a top-level alternation.
     *
     * Inside a character class every character except an unescaped ']' is literal,
     * so '(' and '[' found there do not open a new nesting level.
     *
     * @return array{0: string, 1: string}|null text before and after the '|', or null if there is none
     */
    private function splitTopLevelAlternation(string $regex): ?array
    {
        $neededCharacters = [];
        $length = strlen($regex);
        for ($i = 0; $i < $length; $i++) {
            $char = $regex[$i];
            if ($char === '\\') {
                $i++; // skip escaped character
                continue;
            }
            $expectedCloser = $neededCharacters === []
                ? null
                : $neededCharacters[array_key_last($neededCharacters)];
            if ($expectedCloser === ']') {
                // Inside a character class only the closing bracket matters.
                if ($char === ']') {
                    array_pop($neededCharacters);
                }
                continue;
            }
            if ($char === '(') {
                $neededCharacters[] = ')';
            } elseif ($char === '[') {
                $neededCharacters[] = ']';
            } elseif ($char === $expectedCloser) {
                array_pop($neededCharacters);
            } elseif ($char === '|' && $expectedCloser === null) {
                // Found top-level alternation
                return [
                    substr($regex, 0, $i),
                    substr($regex, $i + 1),
                ];
            }
        }
        return null;
    }

    /**
     * Iterates over the parts of the full regex, independent of how much of this
     * stream has already been consumed through nextToken().
     */
    public function getIterator(): Traversable
    {
        return new RegexPartIterator($this->fullRegex);
    }

    /**
     * Wraps $part in every quantifier that directly follows it in the stream.
     *
     * Handles '*', '+', '?', '{n}', '{n,m}' (either bound may be omitted) and a
     * directly following '|'. Each recognised quantifier is consumed from the
     * stream; several in a row are nested from the inside out.
     *
     * @return RegexPartInterface the (possibly wrapped) part
     */
    private function createRepetition(RegexPartInterface $part): RegexPartInterface
    {
        while (true) {
            $firstCharacter = substr($this->regexToStream, 0, 1);
            if ($firstCharacter === '*') {
                $this->regexToStream = substr($this->regexToStream, 1);
                $part = new RepetitionToken($part);
                continue;
            }
            if ($firstCharacter === '+') {
                $this->regexToStream = substr($this->regexToStream, 1);
                $part = new RepetitionToken($part, true);
                continue;
            }
            if ($firstCharacter === '?') {
                $this->regexToStream = substr($this->regexToStream, 1);
                $part = new OptionalToken($part);
                continue;
            }
            if ($firstCharacter === '|') {
                // Fallback for an alternation that splitTopLevelAlternation() did not report.
                $alternative = iterator_to_array(new self(substr($this->regexToStream, 1)));
                $this->regexToStream = '';
                return new MatchOrMatch([$part], $alternative);
            }
            if ($firstCharacter === '{') {
                if (preg_match('/^\{\s*(\d*)\s*,\s*(\d*)\s*\}/', $this->regexToStream, $matches)) {
                    $this->regexToStream = substr($this->regexToStream, strlen($matches[0]));
                    $minimum = $matches[1] === '' ? null : intval($matches[1]);
                    $maximum = $matches[2] === '' ? null : intval($matches[2]);
                    $part = new RepeatToken($part, $minimum, $maximum, $matches[0]);
                    continue;
                }
                if (preg_match('/^\{\s*(\d*)\s*\}/', $this->regexToStream, $matches)) {
                    $this->regexToStream = substr($this->regexToStream, strlen($matches[0]));
                    $repeatCount = $matches[1] === '' ? null : intval($matches[1]);
                    $part = new RepeatToken($part, $repeatCount, $repeatCount, $matches[0]);
                    continue;
                }
                // first character is { without } or invalid format => assume static {
            }
            return $part;
        }
    }

    /**
     * Creates a literal part for the first character of the remaining regex.
     */
    private function createStaticCharacterMarker(): RegexPartInterface
    {
        return new StaticCharacter(substr($this->regexToStream, 0, 1));
    }

    /**
     * Creates the part for a backslash sequence. A backslash that is the very last
     * character has nothing to escape and becomes a literal backslash.
     */
    private function createEscapedCharacter(): RegexPartInterface
    {
        if (strlen($this->regexToStream) === 1) {
            return new StaticCharacter('\\');
        }

        return new EscapedCharacter(substr($this->regexToStream, 1, 1));
    }

    /**
     * Creates the part for '^'. It is a start marker only when no part has been
     * recorded as previous part yet; otherwise it is treated as a literal '^'.
     */
    private function createStartMarker(): RegexPartInterface
    {
        if ($this->previousPart) {
            return $this->createStaticCharacterMarker();
        }
        return new StartOfRegex();
    }

    /**
     * Creates a capture group from the '(' at the start of the remaining regex.
     *
     * Scans for the matching ')' while honouring escapes, nested groups and
     * character classes (a ')' inside '[...]' is literal and does not close the
     * group). The text between the parentheses is tokenized with a new stream.
     * If the group is never closed, everything after the '(' is used as content.
     */
    private function createCaptureGroup(): RegexPartInterface
    {
        $length = strlen($this->regexToStream);
        $ptr = 1;
        $depth = 1;
        $insideCharacterClass = false;
        $closed = false;
        while ($ptr < $length) {
            $character = $this->regexToStream[$ptr];
            $ptr++;
            if ($character === '\\') {
                $ptr++; // skip the escaped character as well
                continue;
            }
            if ($insideCharacterClass) {
                if ($character === ']') {
                    $insideCharacterClass = false;
                }
                continue;
            }
            if ($character === '[') {
                $insideCharacterClass = true;
            } elseif ($character === '(') {
                $depth++;
            } elseif ($character === ')') {
                $depth--;
                if ($depth === 0) {
                    $closed = true;
                    break;
                }
            }
        }
        // Only strip the closing parenthesis when one was actually found.
        $insideCaptureGroup = $closed
            ? substr($this->regexToStream, 1, $ptr - 2)
            : substr($this->regexToStream, 1);
        return new CaptureGroup(
            iterator_to_array(new self($insideCaptureGroup))
        );
    }

    /**
     * Creates a character class part from the '[' at the start of the remaining regex.
     *
     * The class ends at the first unescaped ']'. If it is never closed, everything
     * after the '[' is used as content.
     */
    private function createAnyMatch(): RegexPartInterface
    {
        $length = strlen($this->regexToStream);
        $ptr = 1;
        $closed = false;
        while ($ptr < $length) {
            $character = $this->regexToStream[$ptr];
            if ($character === '\\') {
                $ptr += 2; // skip the backslash and the escaped character
                continue;
            }
            $ptr++;
            if ($character === ']') {
                $closed = true;
                break;
            }
        }
        // Only strip the closing bracket when one was actually found.
        $insideAnyMatch = $closed
            ? substr($this->regexToStream, 1, $ptr - 2)
            : substr($this->regexToStream, 1);
        return new AnyMatch(
            $insideAnyMatch
        );
    }

    /**
     * Creates the part for '$'.
     */
    private function createEndMarker(): RegexPartInterface
    {
        return new EndOfRegex();
    }
}