From f95130b4c4d56d8eb5313867f053cf3cddc89e06 Mon Sep 17 00:00:00 2001
From: Philip Helger
Date: Mon, 12 Jun 2023 16:47:09 +0200
Subject: [PATCH] Added unescaping on the CSSCharStream level

---
Review note: _lookaheadCodePoint () no longer pushes back EOF and read ()
stops when EOF is hit right after an escaped newline. Hunk offsets and the
diffstat were adjusted by hand for the added lines; the "index" blob ids
below still refer to the originally generated patch.

 .../com/helger/css/parser/CSSCharStream.java  | 278 +++++++++++++++++-
 .../LoggingCSSParseErrorHandler.java          |  42 +++-
 .../css/supplementary/issues/Issue91Test.java |   4 +-
 .../wiki/WikiCreateFontFaceRuleFuncTest.java  |  11 +-
 4 files changed, 315 insertions(+), 20 deletions(-)

diff --git a/ph-css/src/main/java/com/helger/css/parser/CSSCharStream.java b/ph-css/src/main/java/com/helger/css/parser/CSSCharStream.java
index 2abc3b66..27549379 100644
--- a/ph-css/src/main/java/com/helger/css/parser/CSSCharStream.java
+++ b/ph-css/src/main/java/com/helger/css/parser/CSSCharStream.java
@@ -23,8 +23,14 @@
 import javax.annotation.Nonnegative;
 import javax.annotation.Nonnull;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import com.helger.commons.ValueEnforcer;
+import com.helger.commons.io.stream.NonBlockingPushbackReader;
 import com.helger.commons.io.stream.StreamHelper;
+import com.helger.commons.string.StringHelper;
+import com.helger.css.reader.errorhandler.LoggingCSSParseErrorHandler;
 
 import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
 
@@ -40,9 +46,258 @@
 @SuppressFBWarnings ("NM_METHOD_NAMING_CONVENTION")
 public final class CSSCharStream implements CharStream
 {
+  /**
+   * A special char iterator based on
+   * https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
+   *
+   * @author Philip Helger
+   */
+  private static final class CSSFilterCodePointsReader implements AutoCloseable
+  {
+    private static final Logger LOGGER = LoggerFactory.getLogger (CSSCharStream.CSSFilterCodePointsReader.class);
+
+    private final NonBlockingPushbackReader m_aLocalReader;
+
+    public CSSFilterCodePointsReader (@Nonnull final Reader aSrcReader)
+    {
+      // 1 char look ahead is sufficient
+      m_aLocalReader = new NonBlockingPushbackReader (aSrcReader, 1);
+    }
+
+    public void close () throws IOException
+    {
+      m_aLocalReader.close ();
+    }
+
+    /**
+     * Peek at the next code point without consuming it.
+     *
+     * @return Next character to come including pushing it back or -1 on EOF
+     * @throws IOException
+     *         on read error
+     */
+    private int _lookaheadCodePoint () throws IOException
+    {
+      int ret = m_aLocalReader.read ();
+      if (ret == -1)
+      {
+        // Never push back EOF: unread (-1) would be narrowed to the char
+        // 0xFFFF which the next read would deliver as regular input
+        return ret;
+      }
+      m_aLocalReader.unread (ret);
+
+      switch (ret)
+      {
+        case 0:
+          ret = (char) 0xfffd;
+          break;
+        case '\f':
+          ret = '\n';
+          break;
+        case '\r':
+          // No matter if followed by \n or not
+          ret = '\n';
+          break;
+      }
+      return ret;
+    }
+
+    /**
+     * This is the method implementing
+     * https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
+     *
+     * @return Next code point. May read 1 or 2 chars.
+     */
+    private int _readFilteredCodePoint () throws IOException
+    {
+      // See https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
+      int ret = m_aLocalReader.read ();
+      switch (ret)
+      {
+        case 0:
+          // 0 means "unsupported character"
+          ret = (char) 0xfffd;
+          break;
+        case '\f':
+          // Form feed becomes \n
+          ret = '\n';
+          break;
+        case '\r':
+        {
+          // Read next
+          final int next = m_aLocalReader.read ();
+          if (next == '\n')
+          {
+            // Handle \r\n as one \n
+          }
+          else
+            if (next != -1)
+            {
+              // Unread the char (except EOF)
+              m_aLocalReader.unread (next);
+            }
+          // \r and \r\n becomes \n
+          ret = '\n';
+          break;
+        }
+      }
+      if (LOGGER.isTraceEnabled ())
+      {
+        if (ret == -1)
+          LOGGER.trace ("Read EOF");
+        else
+          LOGGER.trace ("Read " + LoggingCSSParseErrorHandler.createLoggingStringIllegalCharacter ((char) ret));
+      }
+      return ret;
+    }
+
+    private static boolean _isNewLine (final int c)
+    {
+      return c == '\n';
+    }
+
+    private static boolean _isWhitespace (final int c)
+    {
+      return _isNewLine (c) || c == '\t' || c == ' ';
+    }
+
+    private static boolean _isHexChar (final int c)
+    {
+      return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
+    }
+
+    /**
+     * Handle https://www.w3.org/TR/css-syntax-3/#escaping
+     *
+     * @param cSrcFiltered
+     *        The filtered code point that was just read.
+     * @return The passed code point if it is not a backslash. Otherwise the
+     *         value of up to 6 following hex digits (a single trailing
+     *         whitespace is skipped), the code point following an escaped
+     *         newline (-1 on EOF) or the backslash itself.
+     * @throws IOException
+     *         on read error
+     */
+    private int _handleUnescape (final int cSrcFiltered) throws IOException
+    {
+      if (cSrcFiltered != '\\')
+      {
+        // Return as is
+        return cSrcFiltered;
+      }
+
+      // Check next char
+      int nCodePoint = 0;
+      int nHexCount = 0;
+      while (nHexCount < 6)
+      {
+        final int cNext = _lookaheadCodePoint ();
+        if (_isHexChar (cNext))
+        {
+          nHexCount++;
+          // Consume char
+          _readFilteredCodePoint ();
+          nCodePoint = (nCodePoint * 16) + StringHelper.getHexValue ((char) cNext);
+        }
+        else
+          break;
+      }
+
+      if (nHexCount == 0)
+      {
+        // Check if the next char is a newline
+        final int cNext = _lookaheadCodePoint ();
+        if (_isNewLine (cNext))
+        {
+          // Consume newline char
+          _readFilteredCodePoint ();
+          // Return the code point following the newline
+          return _readFilteredCodePoint ();
+        }
+
+        // Return the backslash as is
+        return cSrcFiltered;
+      }
+
+      // Hex chars found
+      // Check for a trailing whitespace and evtl. skip it
+      final int cNext = _lookaheadCodePoint ();
+      if (_isWhitespace (cNext))
+      {
+        // Consume char
+        _readFilteredCodePoint ();
+      }
+
+      return nCodePoint;
+    }
+
+    /**
+     * Read filtered and unescaped characters into the passed buffer.
+     *
+     * @param buf
+     *        Destination buffer. May not be {@code null}.
+     * @param nOfs
+     *        Index of the first char to be written. Must be >= 0.
+     * @param nLen
+     *        Maximum number of code points to process. Must be >= 0.
+     * @return The number of chars written to the buffer or -1 if no char was
+     *         written.
+     * @throws IOException
+     *         on read error
+     */
+    public int read (@Nonnull final char [] buf, @Nonnegative final int nOfs, @Nonnegative final int nLen)
+                                                                                                           throws IOException
+    {
+      ValueEnforcer.notNull (buf, "buf");
+      ValueEnforcer.isGE0 (nOfs, "Ofs");
+      ValueEnforcer.isGE0 (nLen, "Len");
+
+      if (LOGGER.isTraceEnabled ())
+        LOGGER.trace ("## read (" + nOfs + ", " + nLen + ")");
+
+      int nCharsRead = 0;
+      int nDstPos = nOfs;
+      for (int i = 0; i < nLen; ++i)
+      {
+        final int c = _readFilteredCodePoint ();
+        if (c == -1)
+        {
+          // EOF
+          break;
+        }
+
+        final int cCleanChar = _handleUnescape (c);
+        if (cCleanChar == -1)
+        {
+          // EOF after an escaped newline - there is no char to store
+          break;
+        }
+
+        if (cCleanChar <= Character.MAX_VALUE)
+        {
+          buf[nDstPos] = (char) cCleanChar;
+          nCharsRead++;
+          nDstPos++;
+        }
+        else
+        {
+          // TODO handle code points cleanly
+          LOGGER.warn ("Unsupported code point found: " + cCleanChar);
+        }
+      }
+      if (LOGGER.isTraceEnabled ())
+        LOGGER.trace ("## read " + nCharsRead + " chars");
+
+      // -1 meaning EOF
+      return nCharsRead == 0 ? -1 : nCharsRead;
+    }
+  }
+
+  public static final int DEFAULT_TAB_SIZE = 8;
   private static final int DEFAULT_BUF_SIZE = 4096;
 
-  private final Reader m_aReader;
+  private final CSSFilterCodePointsReader m_aReader;
   private int m_nLine;
   private int m_nColumn;
   private int m_nAvailable;
@@ -62,7 +317,7 @@ public final class CSSCharStream implements CharStream
   /** Position in buffer. */
   private int m_nBufpos = -1;
 
-  private int m_nTabSize = 8;
+  private int m_nTabSize = DEFAULT_TAB_SIZE;
   private boolean m_bTrackLineColumn = true;
 
   public CSSCharStream (@Nonnull final Reader aReader)
@@ -75,11 +330,15 @@ private CSSCharStream (@Nonnull final Reader aReader,
                          @Nonnegative final int nStartColumn,
                          @Nonnegative final int nBufferSize)
   {
+    ValueEnforcer.notNull (aReader, "Reader");
+    ValueEnforcer.isGE0 (nStartLine, "StartLine");
+    ValueEnforcer.isGE0 (nStartColumn, "StartColumn");
     ValueEnforcer.isGE0 (nBufferSize, "BufferSize");
+
     // Using a buffered reader gives a minimal speedup
-    m_aReader = StreamHelper.getBuffered (ValueEnforcer.notNull (aReader, "Reader"));
-    m_nLine = ValueEnforcer.isGE0 (nStartLine, "StartLine");
-    m_nColumn = ValueEnforcer.isGE0 (nStartColumn, "StartColumn") - 1;
+    m_aReader = new CSSFilterCodePointsReader (StreamHelper.getBuffered (aReader));
+    m_nLine = nStartLine;
+    m_nColumn = nStartColumn - 1;
 
     m_nAvailable = nBufferSize;
     m_nBufsize = nBufferSize;
 
@@ -355,10 +614,13 @@ public void backup (final int nAmount)
   /** @return token image as String */
   public String getImage ()
   {
+    final String sImage;
     if (m_nBufpos >= m_nTokenBegin)
-      return new String (m_aBuffer, m_nTokenBegin, m_nBufpos - m_nTokenBegin + 1);
-
-    return new String (m_aBuffer, m_nTokenBegin, m_nBufsize - m_nTokenBegin) + new String (m_aBuffer, 0, m_nBufpos + 1);
+      sImage = new String (m_aBuffer, m_nTokenBegin, m_nBufpos - m_nTokenBegin + 1);
+    else
+      sImage = new String (m_aBuffer, m_nTokenBegin, m_nBufsize - m_nTokenBegin) +
+               new String (m_aBuffer, 0, m_nBufpos + 1);
+    return sImage;
   }
 
   /** @return suffix */
diff --git a/ph-css/src/main/java/com/helger/css/reader/errorhandler/LoggingCSSParseErrorHandler.java b/ph-css/src/main/java/com/helger/css/reader/errorhandler/LoggingCSSParseErrorHandler.java
index df6bbf42..0da72291 100644
--- a/ph-css/src/main/java/com/helger/css/reader/errorhandler/LoggingCSSParseErrorHandler.java
+++ b/ph-css/src/main/java/com/helger/css/reader/errorhandler/LoggingCSSParseErrorHandler.java
@@ -85,10 +85,18 @@ public static String createLoggingStringParseError (@Nonnull final Token aLastVa
     }
 
     final StringBuilder retval = new StringBuilder (1024);
-    retval.append ('[').append (aLastValidToken.next.beginLine).append (':').append (aLastValidToken.next.beginColumn).append (']');
+    retval.append ('[')
+          .append (aLastValidToken.next.beginLine)
+          .append (':')
+          .append (aLastValidToken.next.beginColumn)
+          .append (']');
     if (aLastSkippedToken != null)
     {
-      retval.append ("-[").append (aLastSkippedToken.endLine).append (':').append (aLastSkippedToken.endColumn).append (']');
+      retval.append ("-[")
+            .append (aLastSkippedToken.endLine)
+            .append (':')
+            .append (aLastSkippedToken.endColumn)
+            .append (']');
     }
     retval.append (" Encountered");
     Token aCurToken = aLastValidToken.next;
@@ -100,17 +108,22 @@ public static String createLoggingStringParseError (@Nonnull final Token aLastVa
         retval.append (aTokenImageVal[TOKEN_EOF]);
         break;
       }
-      retval.append ("text '").append (aCurToken.image).append ("' corresponding to token ").append (aTokenImageVal[aCurToken.kind]);
+      retval.append ("text '")
+            .append (aCurToken.image)
+            .append ("' corresponding to token ")
+            .append (aTokenImageVal[aCurToken.kind]);
       aCurToken = aCurToken.next;
     }
     retval.append (". ");
     if (aLastSkippedToken != null)
       retval.append ("Skipped until token ").append (aLastSkippedToken).append (". ");
-    retval.append (aExpectedTokenSequencesVal.length == 1 ? "Was expecting:" : "Was expecting one of:").append (aExpected);
+    retval.append (aExpectedTokenSequencesVal.length == 1 ? "Was expecting:" : "Was expecting one of:")
+          .append (aExpected);
     return retval.toString ();
   }
 
-  public void onCSSParseError (@Nonnull final ParseException aParseEx, @Nullable final Token aLastSkippedToken) throws ParseException
+  public void onCSSParseError (@Nonnull final ParseException aParseEx, @Nullable final Token aLastSkippedToken)
+                                                                                                                throws ParseException
   {
     if (aParseEx.expectedTokenSequences == null)
       LOGGER.warn (aParseEx.getMessage ());
@@ -140,7 +153,14 @@ public static String createLoggingStringUnexpectedRule (@Nonnull final Token aCu
                                                           @Nonnull @Nonempty final String sRule,
                                                           @Nonnull @Nonempty final String sMsg)
   {
-    return "[" + aCurrentToken.beginLine + ":" + aCurrentToken.beginColumn + "] Unexpected rule '" + sRule + "': " + sMsg;
+    return "[" +
+           aCurrentToken.beginLine +
+           ":" +
+           aCurrentToken.beginColumn +
+           "] Unexpected rule '" +
+           sRule +
+           "': " +
+           sMsg;
   }
 
   public void onCSSUnexpectedRule (@Nonnull final Token aCurrentToken,
@@ -165,7 +185,8 @@ public void onCSSUnexpectedRule (@Nonnull final Token aCurrentToken,
    */
   @Nonnull
   @Nonempty
-  public static String createLoggingStringDeprecatedProperty (@Nonnull final Token aPrefixToken, @Nonnull final Token aIdentifierToken)
+  public static String createLoggingStringDeprecatedProperty (@Nonnull final Token aPrefixToken,
+                                                              @Nonnull final Token aIdentifierToken)
   {
     return "[" +
            aPrefixToken.beginLine +
@@ -217,7 +238,12 @@ public void onCSSBrowserCompliantSkip (@Nullable final ParseException ex,
   @Nonempty
   public static String createLoggingStringIllegalCharacter (final char cIllegalChar)
   {
-    return "Found illegal character: " + cIllegalChar + " (0x" + StringHelper.getHexStringLeadingZero (cIllegalChar, 4) + ")";
+    final String sCharHex = "0x" + StringHelper.getHexStringLeadingZero (cIllegalChar, 4);
+    final String sPrintableChar = cIllegalChar <= 32 || cIllegalChar > 255 ? sCharHex : cIllegalChar +
+                                                                                        " (" +
+                                                                                        sCharHex +
+                                                                                        ")";
+    return "Found illegal character: " + sPrintableChar;
   }
 
   @Override
diff --git a/ph-css/src/test/java/com/helger/css/supplementary/issues/Issue91Test.java b/ph-css/src/test/java/com/helger/css/supplementary/issues/Issue91Test.java
index 84a916ba..e1f84ad3 100644
--- a/ph-css/src/test/java/com/helger/css/supplementary/issues/Issue91Test.java
+++ b/ph-css/src/test/java/com/helger/css/supplementary/issues/Issue91Test.java
@@ -39,7 +39,7 @@ public final class Issue91Test
   @Test
   public void testUnescape1 ()
   {
-    final String sCSS = "div { \73\72\63\3a\35 }";
+    final String sCSS = "div { \\73\\72\\63\\3a\\35 }";
     final CascadingStyleSheet aCSS = CSSReader.readFromStringReader (sCSS,
                                                                      new CSSReaderSettings ().setCSSVersion (ECSSVersion.LATEST)
                                                                                              .setBrowserCompliantMode (true));
@@ -47,7 +47,7 @@ public void testUnescape1 ()
     assertEquals (1, aCSS.getStyleRuleCount ());
 
     final CSSStyleRule aSR = aCSS.getStyleRuleAtIndex (0);
-    assertEquals (2, aSR.getDeclarationCount ());
+    assertEquals (1, aSR.getDeclarationCount ());
 
     assertEquals ("div{src:5}",
                   new CSSWriter (new CSSWriterSettings ().setOptimizedOutput (true)).setWriteHeaderText (false)
diff --git a/ph-css/src/test/java/com/helger/css/supplementary/wiki/WikiCreateFontFaceRuleFuncTest.java b/ph-css/src/test/java/com/helger/css/supplementary/wiki/WikiCreateFontFaceRuleFuncTest.java
index 1ddf16c7..1a676232 100644
--- a/ph-css/src/test/java/com/helger/css/supplementary/wiki/WikiCreateFontFaceRuleFuncTest.java
+++ b/ph-css/src/test/java/com/helger/css/supplementary/wiki/WikiCreateFontFaceRuleFuncTest.java
@@ -20,6 +20,8 @@
 import static org.junit.Assert.assertNotNull;
 
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.helger.css.ECSSVersion;
 import com.helger.css.decl.CascadingStyleSheet;
@@ -33,12 +35,17 @@
  */
 public final class WikiCreateFontFaceRuleFuncTest
 {
+  private static final Logger LOGGER = LoggerFactory.getLogger (WikiCreateFontFaceRuleFuncTest.class);
+
   @Test
   public void testBasic ()
   {
-    final CascadingStyleSheet aCSS = WikiCreateFontFaceRule.createFontFace ("Your \"typeface\"", "local font name", "folder/", "myfont");
+    final CascadingStyleSheet aCSS = WikiCreateFontFaceRule.createFontFace ("Your \"typeface\"",
+                                                                            "local font name",
+                                                                            "folder/",
+                                                                            "myfont");
     final String sCSS = new CSSWriter (ECSSVersion.CSS30).getCSSAsString (aCSS);
-    System.out.println (sCSS);
+    LOGGER.info (sCSS);
 
     final CascadingStyleSheet aCSS2 = CSSReader.readFromString (sCSS, ECSSVersion.CSS30);
     assertNotNull (aCSS2);