Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
94.44% |
102 / 108 |
|
81.82% |
9 / 11 |
CRAP | |
0.00% |
0 / 1 |
| RegexStream | |
94.44% |
102 / 108 |
|
81.82% |
9 / 11 |
41.29 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| nextToken | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
3 | |||
| splitTopLevelAlternation | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
9 | |||
| getIterator | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| createRepetition | |
84.38% |
27 / 32 |
|
0.00% |
0 / 1 |
11.46 | |||
| createStaticCharacterMarker | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| createEscapedCharacter | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| createStartMarker | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
| createCaptureGroup | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
6 | |||
| createAnyMatch | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
4 | |||
| createEndMarker | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | namespace Apie\RegexTools; |
| 3 | |
| 4 | use Apie\RegexTools\Parts\AnyMatch; |
| 5 | use Apie\RegexTools\Parts\CaptureGroup; |
| 6 | use Apie\RegexTools\Parts\EndOfRegex; |
| 7 | use Apie\RegexTools\Parts\EscapedCharacter; |
| 8 | use Apie\RegexTools\Parts\MatchOrMatch; |
| 9 | use Apie\RegexTools\Parts\OptionalToken; |
| 10 | use Apie\RegexTools\Parts\RegexPartInterface; |
| 11 | use Apie\RegexTools\Parts\RepeatToken; |
| 12 | use Apie\RegexTools\Parts\RepetitionToken; |
| 13 | use Apie\RegexTools\Parts\StartOfRegex; |
| 14 | use Apie\RegexTools\Parts\StaticCharacter; |
| 15 | use IteratorAggregate; |
| 16 | use Traversable; |
| 17 | |
/**
 * Consumes a regular expression string from left to right and turns it into
 * RegexPartInterface tokens, one per call to nextToken().
 *
 * Iterating over the stream (getIterator()) always starts from the regex that was
 * passed to the constructor, independent of how much nextToken() has consumed.
 */
final class RegexStream implements IteratorAggregate
{
    /**
     * Maps the first character of the remaining regex to the factory method that parses it.
     * Every character that is not listed is handled by createStaticCharacterMarker().
     */
    public const METHODMAP = [
        '^' => 'createStartMarker',
        '$' => 'createEndMarker',
        '\\' => 'createEscapedCharacter',
        '(' => 'createCaptureGroup',
        '[' => 'createAnyMatch',
    ];

    /** Last token returned by nextToken(); null until the first token was produced. */
    private ?RegexPartInterface $previousPart = null;

    /** Untouched copy of the constructor argument, used by getIterator(). */
    private string $fullRegex;

    /**
     * @param string $regexToStream regex (without delimiters) that still has to be parsed;
     *                              shrinks from the left while tokens are consumed
     */
    public function __construct(
        private string $regexToStream
    ) {
        $this->fullRegex = $regexToStream;
    }

    /**
     * Returns the next token of the remaining regex, or null when nothing is left.
     *
     * If the remaining regex contains a top-level alternation, the whole remainder is
     * consumed at once and returned as a single MatchOrMatch token.
     */
    public function nextToken(): ?RegexPartInterface
    {
        $split = $this->splitTopLevelAlternation($this->regexToStream);
        if ($split !== null) {
            [$left, $right] = $split;
            // Both alternatives are parsed by nested streams, so nothing remains here.
            $this->regexToStream = '';

            return new MatchOrMatch(
                iterator_to_array(new self($left)),
                iterator_to_array(new self($right))
            );
        }

        $firstCharacter = substr($this->regexToStream, 0, 1);
        if ($firstCharacter === '') {
            return null;
        }

        $method = self::METHODMAP[$firstCharacter] ?? 'createStaticCharacterMarker';
        /** @var RegexPartInterface $part */
        $part = $this->$method();
        // The factory methods only read the stream; the consumed prefix is dropped here.
        $this->regexToStream = substr($this->regexToStream, $part->getRegexStringLength());
        $part = $this->createRepetition($part);
        $this->previousPart = $part;

        return $part;
    }

    /**
     * Splits the regex at its first top-level '|'.
     *
     * Alternation has the lowest precedence in a regular expression: 'abc|def' means
     * 'abc' or 'def'. Tokenizing naively from left to right would instead yield
     * a, b, (c or d), e, f, so the alternation has to be detected before tokenizing.
     *
     * A '|' is top-level when it is not escaped, not inside a capture group and not
     * inside a character class. Inside a character class every character except an
     * unescaped ']' is literal, so '(' and '[' found there do not open anything; this
     * matches how createAnyMatch() delimits a character class.
     *
     * @return array{0: string, 1: string}|null the parts left and right of the '|',
     *                                          or null when there is no top-level '|'
     */
    private function splitTopLevelAlternation(string $regex): ?array
    {
        $openGroups = 0;
        $insideCharacterClass = false;
        $length = strlen($regex);
        for ($i = 0; $i < $length; $i++) {
            $char = $regex[$i];
            if ($char === '\\') {
                $i++; // skip escaped character
                continue;
            }
            if ($insideCharacterClass) {
                if ($char === ']') {
                    $insideCharacterClass = false;
                }
                continue;
            }
            if ($char === '[') {
                $insideCharacterClass = true;
            } elseif ($char === '(') {
                $openGroups++;
            } elseif ($char === ')') {
                // An unmatched ')' is ignored instead of driving the depth negative.
                if ($openGroups > 0) {
                    $openGroups--;
                }
            } elseif ($char === '|' && $openGroups === 0) {
                return [
                    substr($regex, 0, $i),
                    substr($regex, $i + 1),
                ];
            }
        }

        return null;
    }

    /**
     * Iterates over the tokens of the complete regex given to the constructor.
     */
    public function getIterator(): Traversable
    {
        return new RegexPartIterator($this->fullRegex);
    }

    /**
     * Wraps $part in every quantifier that directly follows it in the remaining regex
     * ('*', '+', '?', '{n}', '{n,m}') and consumes those quantifiers from the stream.
     *
     * A '{' that does not start one of the two supported brace forms is left in the
     * stream so the next nextToken() call treats it as a static character.
     */
    private function createRepetition(RegexPartInterface $part): RegexPartInterface
    {
        while (true) {
            $firstCharacter = substr($this->regexToStream, 0, 1);
            if ($firstCharacter === '*') {
                $this->regexToStream = substr($this->regexToStream, 1);
                $part = new RepetitionToken($part);
                continue;
            }
            if ($firstCharacter === '+') {
                $this->regexToStream = substr($this->regexToStream, 1);
                $part = new RepetitionToken($part, true);
                continue;
            }
            if ($firstCharacter === '?') {
                $this->regexToStream = substr($this->regexToStream, 1);
                $part = new OptionalToken($part);
                continue;
            }
            if ($firstCharacter === '|') {
                // Fallback for a '|' that splitTopLevelAlternation() did not report
                // (e.g. unbalanced input): alternate only the current part with the rest.
                $part = new MatchOrMatch(
                    [$part],
                    iterator_to_array(new self(substr($this->regexToStream, 1)))
                );
                $this->regexToStream = '';

                return $part;
            }
            if ($firstCharacter === '{') {
                if (preg_match('/^\{\s*(\d*)\s*,\s*(\d*)\s*\}/', $this->regexToStream, $matches)) {
                    $this->regexToStream = substr($this->regexToStream, strlen($matches[0]));
                    // An omitted bound is passed on as null.
                    $minimum = $matches[1] === '' ? null : intval($matches[1]);
                    $maximum = $matches[2] === '' ? null : intval($matches[2]);
                    $part = new RepeatToken($part, $minimum, $maximum, $matches[0]);
                    continue;
                }
                if (preg_match('/^\{\s*(\d*)\s*\}/', $this->regexToStream, $matches)) {
                    $this->regexToStream = substr($this->regexToStream, strlen($matches[0]));
                    $repeatCount = $matches[1] === '' ? null : intval($matches[1]);
                    $part = new RepeatToken($part, $repeatCount, $repeatCount, $matches[0]);
                    continue;
                }
                // first character is { without } or invalid format => assume static {
            }

            return $part;
        }
    }

    /**
     * Creates a token for the first character of the remaining regex, taken literally.
     */
    private function createStaticCharacterMarker(): RegexPartInterface
    {
        return new StaticCharacter(substr($this->regexToStream, 0, 1));
    }

    /**
     * Creates a token for a backslash escape. A backslash that is the very last
     * character has nothing to escape and becomes a static backslash.
     */
    private function createEscapedCharacter(): RegexPartInterface
    {
        if (strlen($this->regexToStream) === 1) {
            return new StaticCharacter('\\');
        }

        return new EscapedCharacter(substr($this->regexToStream, 1, 1));
    }

    /**
     * Creates the start-of-regex marker for '^'. Once a token has already been
     * produced by this stream, '^' is treated as a static character instead.
     */
    private function createStartMarker(): RegexPartInterface
    {
        if ($this->previousPart !== null) {
            return $this->createStaticCharacterMarker();
        }

        return new StartOfRegex();
    }

    /**
     * Creates a capture group from the '(' at the start of the remaining regex up to
     * its matching ')', parsing the content with a nested stream.
     *
     * Escaped characters and everything inside a character class are skipped while
     * looking for the matching ')', so '([)])' is one group containing the class '[)]'.
     *
     * NOTE(review): when no matching ')' exists, the scan runs to the end of the input
     * and the last character is excluded from the group content as if it were the
     * closing parenthesis.
     */
    private function createCaptureGroup(): RegexPartInterface
    {
        $length = strlen($this->regexToStream);
        $ptr = 1;
        $counter = 1;
        $insideCharacterClass = false;
        while ($ptr < $length) {
            $character = substr($this->regexToStream, $ptr, 1);
            if ($character === '\\') {
                $ptr++; // also step over the escaped character
            }
            $ptr++;
            if ($insideCharacterClass) {
                if ($character === ']') {
                    $insideCharacterClass = false;
                }
                continue;
            }
            if ($character === '[') {
                $insideCharacterClass = true;
            } elseif ($character === ')') {
                $counter--;
                if ($counter === 0) {
                    break;
                }
            } elseif ($character === '(') {
                $counter++;
            }
        }
        // $ptr points just past the closing ')'; strip both parentheses.
        $insideCaptureGroup = substr($this->regexToStream, 1, $ptr - 2);

        return new CaptureGroup(
            iterator_to_array(new self($insideCaptureGroup))
        );
    }

    /**
     * Creates a character class token from the '[' at the start of the remaining regex
     * up to the first unescaped ']'. The content between the brackets is passed on raw.
     */
    private function createAnyMatch(): RegexPartInterface
    {
        $length = strlen($this->regexToStream);
        $ptr = 1;
        while ($ptr < $length) {
            $character = substr($this->regexToStream, $ptr, 1);
            if ($character === '\\') {
                $ptr += 2; // skip the backslash and the character it escapes
                continue;
            }
            $ptr++;
            if ($character === ']') {
                break;
            }
        }
        // $ptr points just past the closing ']'; strip both brackets.
        $insideAnyMatch = substr($this->regexToStream, 1, $ptr - 2);

        return new AnyMatch(
            $insideAnyMatch
        );
    }

    /**
     * Creates the end-of-regex marker for '$'.
     */
    private function createEndMarker(): RegexPartInterface
    {
        return new EndOfRegex();
    }
}