-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 2e1f632
Showing
6 changed files
with
400 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/vendor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
Nokogiri | ||
======== | ||
|
||
Cuts through XML like a breeze. | ||
|
||
Examples | ||
-------- | ||
|
||
Given this XML: | ||
|
||
```xml | ||
<p> | ||
<span>Lorem ipsum dolor <em>sit amet</em>.</span> | ||
</p> | ||
``` | ||
|
||
Cutting it at the twentieth character... | ||
|
||
```php | ||
$Nokogiri = new Nokogiri\Nokogiri(); | ||
$Nokogiri->cut($xml, 20); | ||
``` | ||
|
||
Would return: | ||
|
||
```xml | ||
<p> | ||
<span>Lorem ipsum dolor <em>sit</em>.</span> | ||
</p> | ||
``` | ||
|
||
Cutting it at the eleventh character... | ||
|
||
```php | ||
$Nokogiri->cut($xml, 11); | ||
``` | ||
|
||
Would return: | ||
|
||
```xml | ||
<p> | ||
<span>Lorem ipsum</span> | ||
</p> | ||
``` | ||
|
||
Note that the blank characters between tags are not taken into account. | ||
|
||
Pitfalls | ||
-------- | ||
|
||
For now, Nokogiri won't work properly with strings that are not enclosed | ||
in a proper tag, like this: | ||
|
||
```xml | ||
<p> | ||
Some unenclosed string | ||
<span>Lorem ipsum</span> | ||
</p> | ||
``` | ||
|
||
Notes | ||
----- | ||
|
||
The implementation is probably shitty, as I don't know anything about writing a | ||
decent parser... | ||
|
||
Also, the implementation of the parser itself is kind of tied to the class using | ||
it. It is obviously bad but it works :grin: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
{ | ||
"name": "fg/nokogiri", | ||
"description": "Cuts through XML like a breeze.", | ||
"version": "0.1.0", | ||
"authors": [{ | ||
"name": "Félix Girault", | ||
"email": "[email protected]", | ||
"homepage": "http://www.felix-girault.fr", | ||
"role": "Developer" | ||
}], | ||
"keywords": [ | ||
"xml", | ||
"cut", | ||
"truncate" | ||
], | ||
"license": "MIT", | ||
"homepage": "http://github.com/felixgirault/essence", | ||
"autoload": { | ||
"psr-4": { | ||
"Nokogiri\\": "src/" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
<?php | ||
|
||
namespace Nokogiri; | ||
|
||
use Nokogiri\Parser; | ||
|
||
|
||
|
||
/**
 * Truncates an XML string after a given amount of text, closing every tag
 * that is still open at the cut point.
 */
class Nokogiri {

    /**
     * Cuts $xml once $limit bytes of text have been read.
     *
     * The string is walked with a Parser. Opening and closing tags maintain a
     * stack of the currently open tag names, and each run of text found
     * between two tags is added to a running total. Runs made only of
     * whitespace are ignored. As soon as the total reaches $limit, parsing
     * stops, the string is truncated at the matching offset and the tags
     * still on the stack are closed.
     *
     * Lengths are measured with strlen(), i.e. in bytes, so multi-byte
     * characters count for more than one unit.
     *
     * @param string $xml XML string to truncate.
     * @param int $limit Amount of text to keep, in bytes.
     * @return string The truncated XML, or $xml unchanged when no cut point
     *     was found.
     */
    public function cut($xml, $limit) {
        $Parser = new Parser();
        $opened = [];
        $count = 0;
        $position = 0;

        $Parser->on(
            Parser::OPENED_TAG_EVENT,
            function($tag) use (&$opened) {
                $opened[] = $tag;
            }
        );

        $Parser->on(
            Parser::PARSED_TAG_CONTENTS_EVENT,
            function($contents, $i) use (&$count, &$position, $limit) {
                // indentation and line breaks between tags do not count
                if ($this->_isWhitespace($contents)) {
                    return true;
                }

                $count += strlen($contents);

                if ($count < $limit) {
                    return true;
                }

                // $i is the offset right after $contents: step back by the
                // number of bytes that exceed the limit
                $position = $i - ($count - $limit);

                // returning false asks the parser to stop
                return false;
            }
        );

        $Parser->on(
            Parser::CLOSED_TAG_EVENT,
            function() use (&$opened) {
                array_pop($opened);
            }
        );

        $Parser->parse($xml);

        return $position
            ? $this->_enclose($xml, $position, $opened)
            : $xml;
    }

    /**
     * Tells if a string is made of whitespace characters only.
     *
     * The pattern is anchored on both ends: a string that merely contains
     * whitespace (like "Lorem ipsum") is not considered whitespace. An empty
     * string is not considered whitespace either.
     *
     * @param string $string String to test.
     * @return bool True if $string holds nothing but whitespace.
     */
    protected function _isWhitespace($string) {
        return preg_match('~\A\s+\z~', $string) === 1;
    }

    /**
     * Truncates $xml at $position and appends a closing tag for each name in
     * $tags, innermost (last) first.
     *
     * @param string $xml XML string to truncate.
     * @param int $position Byte offset at which to cut.
     * @param array $tags Names of the open tags, outermost first.
     * @return string The truncated and re-closed XML.
     */
    protected function _enclose($xml, $position, $tags) {
        $xml = substr($xml, 0, $position);

        // compare against null explicitly so that a falsy tag name (such as
        // "0") does not end the loop before every tag has been closed
        while (($tag = array_pop($tags)) !== null) {
            $xml .= "</$tag>";
        }

        return $xml;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
<?php | ||
|
||
namespace Nokogiri; | ||
|
||
|
||
|
||
/**
 * Minimal event-driven XML scanner.
 *
 * The input is read one character at a time through a small state machine,
 * and observers are notified when a tag opens, when a tag closes, and when
 * the text preceding a tag has been read. Comments, CDATA sections,
 * processing instructions and '>' characters inside attribute values get no
 * special treatment.
 */
class Parser {

    // event: the name of an opening tag is known; observers receive the name
    const OPENED_TAG_EVENT = 0;

    // event: a '<' was met while reading text; observers receive the text
    // gathered so far and the offset of that '<'
    const PARSED_TAG_CONTENTS_EVENT = 1;

    // event: a closing or self-closing tag ended; observers receive nothing
    const CLOSED_TAG_EVENT = 2;

    // state: reading the name of a tag, right after '<'
    const PARSING_OPENING_TAG = 0;

    // state: skipping everything between a tag name and its '>'
    const PARSING_TAG_ATTRIBUTES = 1;

    // state: gathering text outside of tags
    const PARSING_TAG_CONTENTS = 2;

    // state: skipping the rest of a closing tag, up to its '>'
    const PARSING_CLOSING_TAG = 3;

    // callbacks, indexed by event (at most one per event)
    protected $_observers = [];

    // false once an observer returned false; parsing then stops
    protected $_continue = true;

    // text gathered since the last tag
    protected $_contents = '';

    // name of the tag currently being read
    protected $_tagName = '';

    // current state, one of the PARSING_* constants
    protected $_state = self::PARSING_TAG_CONTENTS;

    // true while the last character read in a tag's attributes is a '/'
    protected $_selfClosing = false;

    /**
     * Registers the callback to call when $event occurs.
     *
     * Only one callback is kept per event: registering another one for the
     * same event replaces the previous one. A callback may return false to
     * stop the parsing.
     *
     * @param int $event One of the *_EVENT constants.
     * @param callable $callback Callback to notify.
     */
    public function on($event, callable $callback) {
        $this->_observers[$event] = $callback;
    }

    /**
     * Notifies the observer of $event, if any, passing it every extra
     * argument given to this method.
     *
     * @param int $event One of the *_EVENT constants.
     */
    protected function _emit($event) {
        if (!isset($this->_observers[$event])) {
            return;
        }

        $continue = call_user_func_array(
            $this->_observers[$event],
            array_slice(func_get_args(), 1)
        );

        // only an explicit false stops the parsing
        if ($continue === false) {
            $this->_continue = false;
        }
    }

    /**
     * Reads $xml character by character and notifies the observers, until
     * the end of the string or until an observer returns false.
     *
     * Text located after the last tag is gathered but never emitted, as
     * contents are only emitted when a '<' is met.
     *
     * @param string $xml XML string to parse.
     */
    public function parse($xml) {
        // start from a clean slate, so that the instance can be reused even
        // if a previous run was stopped by an observer
        $this->_state = self::PARSING_TAG_CONTENTS;
        $this->_continue = true;
        $this->_contents = '';
        $this->_tagName = '';
        $this->_selfClosing = false;

        $length = strlen($xml);

        for ($i = 0; $i < $length && $this->_continue; $i++) {
            $char = $xml[$i];

            switch ($this->_state) {
                case self::PARSING_OPENING_TAG:
                    $this->_parseOpeningTag($char);
                    break;

                case self::PARSING_TAG_ATTRIBUTES:
                    $this->_parseTagAttributes($char);
                    break;

                case self::PARSING_TAG_CONTENTS:
                    $this->_parseTagContents($char, $i);
                    break;

                case self::PARSING_CLOSING_TAG:
                    $this->_parseClosingTag($char);
                    break;
            }
        }
    }

    /**
     * Handles a character read right after '<', while the tag name is being
     * gathered.
     *
     * @param string $char Current character.
     */
    protected function _parseOpeningTag($char) {
        switch ($char) {
            // we're in fact parsing a closing tag, or, if a name was already
            // read, a self-closing one ("<br/>") which must be opened first
            // for the opened/closed events to stay balanced
            case '/':
                if ($this->_tagName !== '') {
                    $this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
                    $this->_tagName = '';
                }

                $this->_state = self::PARSING_CLOSING_TAG;
                break;

            // we're beginning to parse attributes
            // we know the name of the tag we just opened
            case ' ':
            case "\t":
            case "\n":
            case "\r":
                $this->_state = self::PARSING_TAG_ATTRIBUTES;
                $this->_selfClosing = false;
                $this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
                $this->_tagName = '';
                break;

            // we're reaching the end of an opening tag
            // we know the name of the tag we just opened
            // we can begin to store the tag's contents for later use
            case '>':
                $this->_state = self::PARSING_TAG_CONTENTS;
                $this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
                $this->_tagName = '';
                break;

            // we're storing the tag's name for later use
            default:
                $this->_tagName .= $char;
                break;
        }
    }

    /**
     * Handles a character read between a tag name and the end of the tag.
     *
     * Attributes themselves are ignored; the method only looks for the
     * closing '>' and remembers whether it was directly preceded by a '/'.
     *
     * @param string $char Current character.
     */
    protected function _parseTagAttributes($char) {
        if ($char !== '>') {
            $this->_selfClosing = ($char === '/');
            return;
        }

        // we're reaching the end of an opening tag
        // we can begin to store the tag's contents for later use
        $this->_state = self::PARSING_TAG_CONTENTS;
        $this->_contents = '';

        // "<br />": the tag was announced as opened when its name was read,
        // it must now be announced as closed
        if ($this->_selfClosing) {
            $this->_selfClosing = false;
            $this->_emit(self::CLOSED_TAG_EVENT);
        }
    }

    /**
     * Handles a character read outside of any tag.
     *
     * @param string $char Current character.
     * @param int $i Offset of $char in the parsed string.
     */
    protected function _parseTagContents($char, $i) {
        switch ($char) {
            // we're reaching the start of a tag
            // we can begin to store the tag's name for later use
            case '<':
                $this->_state = self::PARSING_OPENING_TAG;
                $this->_emit(self::PARSED_TAG_CONTENTS_EVENT, $this->_contents, $i);
                $this->_contents = '';
                break;

            // we're storing the tag's contents for later use
            default:
                $this->_contents .= $char;
                break;
        }
    }

    /**
     * Handles a character read inside a closing tag; everything is skipped
     * up to the final '>'.
     *
     * @param string $char Current character.
     */
    protected function _parseClosingTag($char) {
        switch ($char) {
            // we're reaching the end of a closing tag
            case '>':
                $this->_state = self::PARSING_TAG_CONTENTS;
                $this->_emit(self::CLOSED_TAG_EVENT);
                break;
        }
    }
}
Oops, something went wrong.