Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 9 additions & 268 deletions Classes/Service/CleanHtmlService.php
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ public function clean(string $html, array $config = []): string
}

// convert line-breaks to UNIX
$this->convNlOs($html);
$html = preg_replace('/\r\n|\r/', $this->newline, $html);

$manipulations = [];

Expand All @@ -116,16 +116,17 @@ public function clean(string $html, array $config = []): string
$manipulations['removeComments'] = GeneralUtility::makeInstance(RemoveComments::class);
}

if (!empty($this->headerComment)) {
$this->includeHeaderComment($html);
}

foreach ($manipulations as $key => $manipulation) {
/** @var ManipulationInterface $manipulation */
$configuration = isset($config[$key . '.']) && \is_array($config[$key . '.']) ? $config[$key . '.'] : [];
$html = $manipulation->manipulate($html, $configuration);
}

// include configured header comment in HTML content block
if (!empty($this->headerComment)) {
$html = preg_replace('/^(-->)$/m', "\n\t" . $this->headerComment . "\n$1", $html, 1);
}

// cleanup HTML5 self-closing elements
if (!isset($GLOBALS['TSFE']->config['config']['doctype'])
|| 'x' !== substr($GLOBALS['TSFE']->config['config']['doctype'], 0, 1)) {
Expand All @@ -136,276 +137,16 @@ public function clean(string $html, array $config = []): string
);
}

if ($this->formatType > 0) {
$html = $this->formatHtml($html);
if ($this->formatType) {
$indenter = new \Gajus\Dindent\Indenter(['indentation_character' => $this->tab]);
$html = $indenter->indent($html);
}

// remove white space after line ending
$this->rTrimLines($html);

// recover line-breaks
if (Environment::isWindows()) {
$html = str_replace($this->newline, "\r\n", $html);
}

return (string) $html;
}

/**
 * Formats the (X)HTML code:
 * - indents tags with `$this->tab` according to their nesting depth
 * - removes empty space between tags
 * - removes line breaks within tags (preserved where necessary: pre, textarea, comments, style, script)
 *
 * The amount of line breaks is selected through `$this->formatType`:
 * 0 => off
 * 1 => no line break at all (code in one line)
 * 2 => minimalistic line breaks (structure defining box-elements)
 * 3 => aesthetic line breaks (important box-elements)
 * 4 => logic line breaks (all box-elements)
 * 5 => max line breaks (all elements).
 *
 * NOTE(review): inside this method every value >= 4 takes the same branch, so 4 and 5
 * behave identically here — confirm whether level 5 is handled elsewhere.
 *
 * @param string $html markup to format
 *
 * @return string the re-assembled markup; when splitting the markup into tags fails, the
 *                input is returned with the protected fragments put back in place
 */
protected function formatHtml(string $html): string
{
// Save originally formatted pre, textarea, comments, styles and scripts & replace them with markers
preg_match_all(
'/(?s)((<!--.*?-->)|(<[ \n\r]*pre[^>]*>.*?<[ \n\r]*\/pre[^>]*>)|(<[ \n\r]*textarea[^>]*>.*?<[ \n\r]*\/textarea[^>]*>)|(<[ \n\r]*style[^>]*>.*?<[ \n\r]*\/style[^>]*>)|(<[ \n\r]*script[^>]*>.*?<[ \n\r]*\/script[^>]*>))/im',
$html,
$matches
);
$noFormat = $matches[0]; // do not format these block elements
// Each protected fragment becomes a numbered placeholder comment preceded by a line break;
// str_replace() swaps every occurrence of an identical fragment at once.
for ($i = 0; $i < \count($noFormat); ++$i) {
$html = str_replace($noFormat[$i], "\n<!-- ELEMENT {$i} -->", $html);
}

// define box elements for formatting
// The lists are wrapped in atomic groups `(?>...)` and spliced into the patterns below:
// $structureBoxLikeElements is used for formatType 2, $esteticBoxLikeElements for 3 and
// $allBoxLikeElements for 4 and above. '!--' makes comments count as box-like elements.
$trueBoxElements = 'address|blockquote|center|dir|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|isindex|menu|noframes|noscript|ol|p|pre|table|ul|article|aside|details|figcaption|figure|footer|header|hgroup|menu|nav|section';
$functionalBoxElements = 'dd|dt|frameset|li|tbody|td|tfoot|th|thead|tr|colgroup';
$usableBoxElements = 'applet|button|del|iframe|ins|map|object|script';
$imagineBoxElements = 'html|body|head|meta|title|link|script|base|!--';
$allBoxLikeElements = '(?>' . $trueBoxElements . '|' . $functionalBoxElements . '|' . $usableBoxElements . '|' . $imagineBoxElements . ')';
$esteticBoxLikeElements = '(?>html|head|body|meta name|title|div|table|h1|h2|h3|h4|h5|h6|p|form|pre|center|!--)';
$structureBoxLikeElements = '(?>html|head|body|div|!--)';

// split html into its elements: tags are kept as their own pieces (delimiter capture),
// the text between them forms the remaining pieces, empty pieces are dropped
$htmlArrayTemp = preg_split(
'/(<(?:[^<>]+(?:"[^"]*"|\'[^\']*\')?)+>)/',
$html,
-1,
\PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
);

// preg_split() reports a PCRE failure with false: give up on formatting in that case
if (false === $htmlArrayTemp) {
// Restore saved comments, styles and scripts
// (the line break that was inserted in front of each placeholder stays in the markup)
for ($i = 0; $i < \count($noFormat); ++$i) {
$html = str_replace("<!-- ELEMENT {$i} -->", $noFormat[$i], $html);
}

return $html;
}
// remove empty lines: pieces that consist of white space only are replaced by
// $this->emptySpaceChar; index 0 holds an empty string, the real pieces start at index 1
$htmlArray = [''];
$index = 1;
for ($x = 0; $x < \count($htmlArrayTemp); ++$x) {
$text = trim($htmlArrayTemp[$x]);
$htmlArray[$index] = '' !== $text ? $htmlArrayTemp[$x] : $this->emptySpaceChar;
++$index;
}

// rebuild html piece by piece; $tabs tracks the current nesting depth
$html = '';
$tabs = 0;
for ($x = 0; $x < \count($htmlArray); ++$x) {
// the previous piece decides whether a line break has to follow it
$htmlArrayBefore = $htmlArray[$x - 1] ?? '';
$htmlArrayCurrent = $htmlArray[$x] ?? '';

// check if the element should stand in a new line
// (pattern modifiers: U = ungreedy, s = dot matches line breaks, i = case-insensitive)
$newline = false;
if ('<?xml' == substr($htmlArrayBefore, 0, 5)) {
$newline = true;
} elseif (2 == $this->formatType && ( // minimalistic line break
// this element has a line break before itself
preg_match(
'/<' . $structureBoxLikeElements . '(.*)>/Usi',
$htmlArrayCurrent
) || preg_match(
'/<' . $structureBoxLikeElements . '(.*) \/>/Usi',
$htmlArrayCurrent
) // the element before is an element that has a line break after it
|| preg_match(
'/<\/' . $structureBoxLikeElements . '(.*)>/Usi',
$htmlArrayBefore
) || '<!--' == substr(
$htmlArrayBefore,
0,
4
) || preg_match('/<' . $structureBoxLikeElements . '(.*) \/>/Usi', $htmlArrayBefore))
) {
$newline = true;
} elseif (3 == $this->formatType && ( // aesthetic line break
// this element has a line break before itself
preg_match(
'/<' . $esteticBoxLikeElements . '(.*)>/Usi',
$htmlArrayCurrent
) || preg_match(
'/<' . $esteticBoxLikeElements . '(.*) \/>/Usi',
$htmlArrayCurrent
) // the element before is an element that has a line break after it
|| preg_match('/<\/' . $esteticBoxLikeElements . '(.*)>/Usi', $htmlArrayBefore) || '<!--' == substr(
$htmlArrayBefore,
0,
4
) || preg_match('/<' . $esteticBoxLikeElements . '(.*) \/>/Usi', $htmlArrayBefore))
) {
$newline = true;
} elseif ($this->formatType >= 4 && ( // logical line break
// this element has a line break before itself
preg_match(
'/<' . $allBoxLikeElements . '(.*)>/Usi',
$htmlArrayCurrent
) || preg_match(
'/<' . $allBoxLikeElements . '(.*) \/>/Usi',
$htmlArrayCurrent
) // the element before is an element that has a line break after it
|| preg_match('/<\/' . $allBoxLikeElements . '(.*)>/Usi', $htmlArrayBefore) || '<!--' == substr(
$htmlArrayBefore,
0,
4
) || preg_match('/<' . $allBoxLikeElements . '(.*) \/>/Usi', $htmlArrayBefore))
) {
$newline = true;
}

// count down a tab: done before the closing tag is written, so the depth is already
// reduced when its indentation is emitted below
if ('</' == substr($htmlArrayCurrent, 0, 2)) {
--$tabs;
}

// add tabs and line breaks in front of the current tag
if ($newline) {
$html .= $this->newline;
for ($y = 0; $y < $tabs; ++$y) {
$html .= $this->tab;
}
}

// remove white spaces and line breaks and add current tag to the html-string
if ('<![CDATA[' == substr($htmlArrayCurrent, 0, 9) // remove multiple white space in CDATA / XML
|| '<?xml' == substr($htmlArrayCurrent, 0, 5)
) {
$html .= $this->killWhiteSpace($htmlArrayCurrent);
} else { // remove all line breaks
$html .= $this->killLineBreaks($htmlArrayCurrent);
}

// count up a tab: only for opening tags that are neither self-closed ("/>") nor one of
// the listed elements without a closing tag, nor a "<!" / "<?xml" declaration.
// NOTE(review): the names are compared as plain prefixes via substr(), so any tag that
// merely starts with e.g. "br" or "hr" is skipped as well — confirm this is acceptable.
if ('<' == substr($htmlArrayCurrent, 0, 1) && '/' != substr($htmlArrayCurrent, 1, 1) && '/>' != substr($htmlArrayCurrent, -2)) {
if (' ' !== substr($htmlArrayCurrent, 1, 1)
&& 'img' !== substr($htmlArrayCurrent, 1, 3)
&& 'source' !== substr($htmlArrayCurrent, 1, 6)
&& 'br' !== substr($htmlArrayCurrent, 1, 2)
&& 'hr' !== substr($htmlArrayCurrent, 1, 2)
&& 'input' !== substr($htmlArrayCurrent, 1, 5)
&& 'link' !== substr($htmlArrayCurrent, 1, 4)
&& 'meta' !== substr($htmlArrayCurrent, 1, 4)
&& 'col ' !== substr($htmlArrayCurrent, 1, 4)
&& 'frame' !== substr($htmlArrayCurrent, 1, 5)
&& 'isindex' !== substr($htmlArrayCurrent, 1, 7)
&& 'param' !== substr($htmlArrayCurrent, 1, 5)
&& 'area' !== substr($htmlArrayCurrent, 1, 4)
&& 'base' !== substr($htmlArrayCurrent, 1, 4)
&& '<!' !== substr($htmlArrayCurrent, 0, 2)
&& '<?xml' !== substr($htmlArrayCurrent, 0, 5)
) {
++$tabs;
}
}
}

// Remove empty lines
if ($this->formatType > 1) {
$this->removeEmptyLines($html);
}

// Restore saved comments, styles and scripts
for ($i = 0; $i < \count($noFormat); ++$i) {
$html = str_replace("<!-- ELEMENT {$i} -->", $noFormat[$i], $html);
}

// include debug comment at the end: a non-zero depth means opening and closing tags
// did not balance while rebuilding
if (0 != $tabs && true === $this->debugComment) {
$html .= "<!-- {$tabs} open elements found -->";
}

return $html;
}

/**
 * Remove ALL line breaks and collapse multiple white space.
 *
 * Every occurrence of `$this->newline` is dropped, then each run of two or more
 * white space characters is replaced by a single space.
 *
 * @param string $html markup fragment to flatten
 *
 * @return string the flattened fragment; if the regular expression engine fails
 *                the fragment is returned with only the line breaks removed
 */
protected function killLineBreaks(string $html): string
{
    $html = str_replace($this->newline, '', $html);

    // With the `u` modifier preg_replace() returns null when the subject is not valid
    // UTF-8; returning that null would violate the declared string return type.
    $collapsed = preg_replace('/\s\s+/u', ' ', $html);
    if (null === $collapsed) {
        // Retry byte-wise so markup with broken encoding is still compacted.
        $collapsed = preg_replace('/\s\s+/', ' ', $html);
    }

    return $collapsed ?? $html;
}

/**
 * Remove multiple white space, keeps line breaks.
 *
 * The fragment is split at `$this->newline`; every line is trimmed, runs of two or
 * more white space characters inside a line are replaced by a single space, and
 * lines that are empty after trimming are dropped.
 *
 * @param string $html markup fragment (used for CDATA / XML declarations by the formatter)
 *
 * @return string the remaining lines joined with `$this->newline`
 */
protected function killWhiteSpace(string $html): string
{
    $lines = [];

    // foreach instead of an index loop bounded by count(): the previous loop re-counted
    // the array after unset(), so it stopped early and left trailing lines unprocessed.
    foreach (explode($this->newline, $html) as $line) {
        $line = trim($line);

        // Compare with '' explicitly: a line that only holds "0" is falsy in PHP
        // but is real content and must be kept.
        if ('' === $line) {
            continue;
        }

        // Keep the trimmed line untouched should the regular expression engine fail.
        $lines[] = preg_replace('/\s\s+/', ' ', $line) ?? $line;
    }

    return implode($this->newline, $lines);
}

/**
 * Remove white space at the end of lines.
 *
 * The pattern runs in multi-line mode, so `$` matches in front of every line break.
 * Because `\s` also matches line breaks, whitespace-only lines that follow a line
 * end are swallowed together with the trailing blanks.
 *
 * @param string $html markup, modified in place
 */
protected function rTrimLines(string &$html): void
{
    // preg_replace() returns null on a PCRE error (e.g. an exceeded backtrack limit);
    // keep the untouched markup in that case instead of wiping the whole document.
    $html = preg_replace('/\s+$/m', '', $html) ?? $html;
}

/**
 * Normalise line endings: every "\r\n" pair and every lone "\r" is replaced
 * by `$this->newline`.
 *
 * @param string $html markup, modified in place
 */
protected function convNlOs(string &$html): void
{
    // The parentheses act as the pattern delimiters here.
    $lineEndings = "(\r\n|\r)";
    $normalised = preg_replace($lineEndings, $this->newline, $html);

    $html = $normalised;
}

/**
 * Drop every line that is empty or consists of white space only.
 *
 * Lines are delimited by `$this->newline`; the surviving lines keep their order
 * and their own leading/trailing white space.
 *
 * @param string $html markup, modified in place
 */
protected function removeEmptyLines(string &$html): void
{
    $keptLines = array_filter(
        explode($this->newline, $html),
        static fn (string $line): bool => '' !== trim($line)
    );

    // implode() ignores the (now sparse) keys, so no re-indexing is required.
    $html = implode($this->newline, $keptLines);
}

/**
 * Include configured header comment in HTML content block.
 *
 * The text of `$this->headerComment` is inserted, preceded by a line break and a tab,
 * in front of the first line that consists solely of a comment terminator (`-->`).
 *
 * @param string $html markup, modified in place
 */
public function includeHeaderComment(string &$html): void
{
    $headerComment = $this->headerComment;

    // A callback is used so that the header text is inserted literally: inside a plain
    // replacement string sequences such as "$1" or "\0" would be expanded as
    // backreferences. The limit of 1 keeps the comment from being repeated at every
    // matching line, in line with the inline variant used by clean().
    $html = preg_replace_callback(
        '/^(-->)$/m',
        static fn (array $match): string => "\n\t" . $headerComment . "\n" . $match[1],
        $html,
        1
    ) ?? $html; // null signals a PCRE error: keep the markup unchanged
}
}
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,3 @@ composer install jweiland/replacer
|:----------------------------------|:------------|:-----------------------------------------------------------------|:-------------------|
| svgstore.enabled | boolean | Is the SVG extract & merge enabled for this template | 1 |
| svgstore.fileSize | integer | Maximum file size of a SVG to include (in `[byte]`) | 50000 |

---
##### ToDo:
- Try external packages like https://github.com/ArjanSchouten/HtmlMinifier
16 changes: 10 additions & 6 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,14 @@
"license": "GPL-2.0-or-later",
"require": {
"php": "^8.1",
"typo3/cms-core": "^12.4||^13.4"
"typo3/cms-core": "^12.4||^13.4",
"schleuse/dindent": "dev-feature/new-major"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why no real version instead of a feature branch?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

testing; will remove [TMP] commit w/ new version (be ready soon) ref. Schleuse/dindent#6

},
"autoload": {
"psr-4": {
"HTML\\Sourceopt\\": "Classes/"
}
},
"replace": {
"maxserv/replacecontent": "*",
"typo3-ter/replacecontent": "*",
"typo3-ter/sourceopt": "self.version"
},
"require-dev": {
"typo3/testing-framework": "^8.2",
"friendsofphp/php-cs-fixer": "^3.3",
Expand All @@ -29,6 +25,14 @@
"TYPO3\\CMS\\Core\\Tests\\": ".Build/vendor/typo3/cms/typo3/sysext/core/Tests/"
}
},
"replace": {
"maxserv/replacecontent": "*",
"typo3-ter/replacecontent": "*",
"typo3-ter/sourceopt": "self.version"
},
"suggest": {
"lochmueller/staticfilecache": "A very flexible and very, very, very fast cache to TYPO3"
},
"config": {
"vendor-dir": ".Build/vendor",
"bin-dir": ".Build/bin",
Expand Down