#region PDFsharp - A .NET library for processing PDF
//
// Authors:
//   Stefan Lange
//
// Copyright (c) 2005-2017 empira Software GmbH, Cologne Area (Germany)
//
// http://www.pdfsharp.com
// http://sourceforge.net/projects/pdfsharp
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
#endregion

using System;
using System.Globalization;
using System.Diagnostics;
using System.Text;
using System.IO;
using PdfSharp.Internal;

#pragma warning disable 1591

namespace PdfSharp.Pdf.Content
{
    /// <summary>
    /// Lexical analyzer for PDF content files. Adobe specifies no grammar, but it seems that it
    /// is a simple post-fix notation.
    /// The lexer keeps a current character and a one-character look-ahead; every Scan* method
    /// expects the current character to be the first character of the token it scans.
    /// </summary>
    public class CLexer
    {
        /// <summary>
        /// Initializes a new instance of the Lexer class.
        /// </summary>
        /// <param name="content">The bytes of the content stream. The array is used directly, not copied.</param>
        /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
        public CLexer(byte[] content)
        {
            if (content == null)
                throw new ArgumentNullException("content");

            _content = content;
            _charIndex = 0;
        }

        /// <summary>
        /// Initializes a new instance of the Lexer class.
        /// </summary>
        /// <param name="content">The stream whose bytes are copied (via ToArray) and scanned.</param>
        /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
        public CLexer(MemoryStream content)
        {
            if (content == null)
                throw new ArgumentNullException("content");

            _content = content.ToArray();
            _charIndex = 0;
        }

        /// <summary>
        /// Reads the next token and returns its type.
        /// Comments are skipped. The returned symbol is also stored in <see cref="Symbol"/>.
        /// </summary>
        public CSymbol ScanNextToken()
        {
            while (true)
            {
                ClearToken();
                char ch = MoveToNonWhiteSpace();
                switch (ch)
                {
                    case '%':
                        // Eat comments, the parser doesn't handle them.
                        ScanComment();
                        continue;

                    case '/':
                        return _symbol = ScanName();

                    case '+':
                    case '-':
                        return _symbol = ScanNumber();

                    case '[':
                        ScanNextChar();
                        return _symbol = CSymbol.BeginArray;

                    case ']':
                        ScanNextChar();
                        return _symbol = CSymbol.EndArray;

                    case '(':
                        return _symbol = ScanLiteralString();

                    case '<':
                        if (_nextChar == '<')
                            return _symbol = ScanDictionary();
                        return _symbol = ScanHexadecimalString();

                    case '.':
                        return _symbol = ScanNumber();

                    case '"':
                    case '\'':
                        return _symbol = ScanOperator();
                }

                if (char.IsDigit(ch))
                    return _symbol = ScanNumber();

                if (char.IsLetter(ch))
                    return _symbol = ScanOperator();

                if (ch == Chars.EOF)
                    return _symbol = CSymbol.Eof;

                ContentReaderDiagnostics.HandleUnexpectedCharacter(ch);
                // Skip the offending character. If the diagnostics handler returns instead of
                // throwing, repeated calls would otherwise report the same character forever.
                ScanNextChar();
                return _symbol = CSymbol.None;
            }
        }

        /// <summary>
        /// Scans a comment line. (Not yet used, comments are skipped by lexer.)
        /// The token receives all characters from '%' up to, but not including, the line end.
        /// </summary>
        public CSymbol ScanComment()
        {
            Debug.Assert(_currChar == Chars.Percent);

            ClearToken();
            char ch;
            while ((ch = AppendAndScanNextChar()) != Chars.LF && ch != Chars.EOF)
            {
                // Everything happens in the loop condition.
            }
            return _symbol = CSymbol.Comment;
        }

        /// <summary>
        /// Scans the bytes of an inline image.
        /// NYI: Just scans over it.
        /// </summary>
        /// <returns>Always <c>CSymbol.None</c>; <see cref="Symbol"/> is not updated by this method itself.</returns>
        public CSymbol ScanInlineImage()
        {
            // TODO: Implement inline images.
            // Skip this:
            // BI
            // … Key-value pairs …
            // ID
            // … Image data …
            // EI

            bool ascii85 = false;
            while (true)
            {
                ScanNextToken();
                if (_symbol == CSymbol.Eof)
                {
                    // Content ended before the 'ID' operator. Without this check the loop
                    // would spin forever because ScanNextToken keeps returning Eof.
                    ContentReaderDiagnostics.HandleUnexpectedCharacter(_currChar);
                    return CSymbol.None;
                }

                if (_symbol == CSymbol.Operator && Token == "ID")
                    break;

                // HACK: Is image ASCII85 decoded?
                if (!ascii85 && _symbol == CSymbol.Name && (Token == "/ASCII85Decode" || Token == "/A85"))
                    ascii85 = true;
            }

            if (ascii85)
            {
                // Look for '~>' because 'EI' may be part of the encoded image.
                while (_currChar != Chars.EOF && (_currChar != '~' || _nextChar != '>'))
                    ScanNextChar();
                if (_currChar == Chars.EOF)
                    ContentReaderDiagnostics.HandleUnexpectedCharacter(_currChar);
            }

            // Look for white space + 'EI' + white space (or end of content),
            // as 'EI' may be part of the binary image data here too.
            bool endFound = false;
            while (_currChar != Chars.EOF)
            {
                if (IsWhiteSpace(_currChar))
                {
                    if (ScanNextChar() == 'E' && ScanNextChar() == 'I')
                    {
                        char follower = ScanNextChar();
                        if (IsWhiteSpace(follower) || follower == Chars.EOF)
                        {
                            endFound = true;
                            break;
                        }
                    }
                }
                else
                    ScanNextChar();
            }
            if (!endFound)
                ContentReaderDiagnostics.HandleUnexpectedCharacter(_currChar);

            // We currently do nothing with inline images.
            return CSymbol.None;
        }

        /// <summary>
        /// Scans a name. The token includes the leading '/'; '#xx' sequences are replaced by
        /// the character with that hexadecimal code.
        /// </summary>
        /// <exception cref="FormatException">The two characters after '#' are not hexadecimal digits.</exception>
        public CSymbol ScanName()
        {
            Debug.Assert(_currChar == Chars.Slash);

            ClearToken();
            while (true)
            {
                char ch = AppendAndScanNextChar();
                // EOF also ends the name; otherwise a name at the end of the content
                // would append EOF characters forever.
                if (IsWhiteSpace(ch) || IsDelimiter(ch) || ch == Chars.EOF)
                    return _symbol = CSymbol.Name;

                if (ch == '#')
                {
                    // Step onto the first hex digit; the second one is the look-ahead.
                    ScanNextChar();
                    char[] hex = new char[2];
                    hex[0] = _currChar;
                    hex[1] = _nextChar;
                    ScanNextChar();
                    // TODO Check syntax
                    ch = (char)(ushort)int.Parse(new string(hex), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
                    // Replace the current character so the next iteration appends the decoded one.
                    _currChar = ch;
                }
            }
        }

        /// <summary>
        /// Scans over a dictionary ('&lt;&lt;' … '&gt;&gt;') and collects its raw text in the token.
        /// Nested dictionaries, literal strings, and hex strings are tracked only as far as
        /// needed to find the matching '&gt;&gt;'.
        /// </summary>
        protected CSymbol ScanDictionary()
        {
            // TODO Do an actual recursive parse instead of this simple scan.

            ClearToken();
            _token.Append(_currChar);      // '<'
            _token.Append(ScanNextChar()); // '<'

            bool inString = false, inHexString = false;
            int nestedDict = 0, nestedStringParen = 0;
            while (true)
            {
                char ch = ScanNextChar();
                if (ch == Chars.EOF)
                {
                    // Unterminated dictionary. Return after reporting so that a handler
                    // which does not throw cannot cause an endless loop.
                    ContentReaderDiagnostics.HandleUnexpectedCharacter(ch);
                    return CSymbol.Dictionary;
                }

                _token.Append(ch);
                if (ch == '<')
                {
                    if (_nextChar == '<')
                    {
                        _token.Append(ScanNextChar());
                        ++nestedDict;
                    }
                    else
                        inHexString = true;
                }
                else if (!inHexString && ch == '(')
                {
                    if (inString)
                        ++nestedStringParen;
                    else
                    {
                        inString = true;
                        nestedStringParen = 0;
                    }
                }
                else if (inString && ch == ')')
                {
                    if (nestedStringParen > 0)
                        --nestedStringParen;
                    else
                        inString = false;
                }
                else if (inString && ch == '\\')
                {
                    // Take the escaped character verbatim so an escaped paren is not counted.
                    _token.Append(ScanNextChar());
                }
                else if (ch == '>')
                {
                    if (inHexString)
                        inHexString = false;
                    else if (_nextChar == '>')
                    {
                        _token.Append(ScanNextChar());
                        if (nestedDict > 0)
                            --nestedDict;
                        else
                        {
                            ScanNextChar();
                            return CSymbol.Dictionary;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Scans an integer or real number.
        /// At most 10 decimal places contribute to the value; further digits are only
        /// appended to the token.
        /// </summary>
        /// <returns>
        /// <c>CSymbol.Real</c> if the number contains a period, <c>CSymbol.Integer</c> if it fits
        /// into an Int32; otherwise the out-of-range diagnostics is invoked.
        /// </returns>
        public CSymbol ScanNumber()
        {
            long value = 0;
            int decimalDigits = 0;
            bool period = false;
            bool negative = false;

            ClearToken();
            char ch = _currChar;
            if (ch == '+' || ch == '-')
            {
                if (ch == '-')
                    negative = true;
                _token.Append(ch);
                ch = ScanNextChar();
            }

            while (true)
            {
                if (char.IsDigit(ch))
                {
                    _token.Append(ch);
                    if (decimalDigits < 10)
                    {
                        value = 10 * value + ch - '0';
                        if (period)
                            decimalDigits++;
                    }
                }
                else if (ch == '.')
                {
                    if (period)
                        ContentReaderDiagnostics.ThrowContentReaderException("More than one period in number.");

                    period = true;
                    _token.Append(ch);
                }
                else
                    break;
                ch = ScanNextChar();
            }

            if (negative)
                value = -value;

            if (period)
            {
                if (decimalDigits > 0)
                {
                    _tokenAsReal = value / PowersOf10[decimalDigits];
                }
                else
                {
                    _tokenAsReal = value;
                    _tokenAsLong = value;
                }
                return CSymbol.Real;
            }

            _tokenAsLong = value;
            _tokenAsReal = Convert.ToDouble(value);

            Debug.Assert(Int64.Parse(_token.ToString(), CultureInfo.InvariantCulture) == value);

            // Int32.MaxValue itself is a valid integer, hence '<=' (was '<').
            if (value >= Int32.MinValue && value <= Int32.MaxValue)
                return CSymbol.Integer;

            ContentReaderDiagnostics.ThrowNumberOutOfIntegerRange(value);
            return CSymbol.Error;
        }

        // Divisors for converting the scanned digits into a real value; index = number of decimal places.
        static readonly double[] PowersOf10 = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000 };

        /// <summary>
        /// Scans an operator, i.e. the longest run of operator characters
        /// (see <see cref="IsOperatorChar"/>) starting at the current character.
        /// </summary>
        public CSymbol ScanOperator()
        {
            ClearToken();
            char ch = _currChar;
            // Scan token
            while (IsOperatorChar(ch))
                ch = AppendAndScanNextChar();

            return _symbol = CSymbol.Operator;
        }

        /// <summary>
        /// Scans a literal string ('(' … ')'). The token receives the decoded characters
        /// without the enclosing parentheses. A string starting with the bytes FE FF is
        /// read as pairs of bytes (high byte first), all others as 8-bit characters.
        /// If the content ends before the closing parenthesis, the unexpected EOF is
        /// reported and the characters scanned so far are returned.
        /// </summary>
        // TODO
        public CSymbol ScanLiteralString()
        {
            Debug.Assert(_currChar == Chars.ParenLeft);

            ClearToken();
            int parenLevel = 0;
            char ch = ScanNextChar();

            // Test UNICODE string
            if (ch == '\xFE' && _nextChar == '\xFF')
            {
                // I'm not sure if the code is correct in any case.
                // ? Can a UNICODE character not start with ')' as hibyte
                // ? What about \# escape sequences
                ScanNextChar(); // Step onto the second BOM byte.
                while (true)
                {
                    char chHi = ScanNextChar();
                    if (chHi == ')')
                    {
                        // End of string (also covers the empty unicode string).
                        ScanNextChar();
                        return _symbol = CSymbol.String;
                    }
                    if (chHi == Chars.EOF)
                        return ReportUnterminatedString();

                    char chLo = ScanNextChar();
                    if (chLo == Chars.EOF)
                        return ReportUnterminatedString();

                    ch = (char)(chHi * 256 + chLo);

                SkipCharUnicode:
                    switch (ch)
                    {
                        case '(':
                            parenLevel++;
                            break;

                        case ')':
                            if (parenLevel == 0)
                            {
                                ScanNextChar();
                                return _symbol = CSymbol.String;
                            }
                            parenLevel--;
                            break;

                        case '\\':
                            {
                                // TODO: not sure that this is correct...
                                bool decoded = ScanEscapeSequence(out ch);
                                if (ch == Chars.EOF)
                                    return ReportUnterminatedString();
                                if (!decoded)
                                    goto SkipCharUnicode;
                                break;
                            }

                        default:
                            // Every other char is appended to the token.
                            break;
                    }
                    _token.Append(ch);
                }
            }

            // 8-bit characters
            while (true)
            {
            SkipChar:
                if (ch == Chars.EOF)
                    return ReportUnterminatedString();

                switch (ch)
                {
                    case '(':
                        parenLevel++;
                        break;

                    case ')':
                        if (parenLevel == 0)
                        {
                            ScanNextChar();
                            return _symbol = CSymbol.String;
                        }
                        parenLevel--;
                        break;

                    case '\\':
                        {
                            bool decoded = ScanEscapeSequence(out ch);
                            if (ch == Chars.EOF)
                                return ReportUnterminatedString();
                            if (!decoded)
                                goto SkipChar;
                            break;
                        }

                    default:
                        // Every other char is appended to the token.
                        break;
                }
                _token.Append(ch);
                ch = ScanNextChar();
            }
        }

        /// <summary>
        /// Decodes the escape sequence that follows a backslash inside a literal string.
        /// The current character must be the backslash.
        /// </summary>
        /// <param name="ch">
        /// Receives the decoded character, or, for a line continuation (backslash followed by
        /// a line end), the first character after the line end.
        /// </param>
        /// <returns>
        /// True if <paramref name="ch"/> is a decoded character that must be appended as is;
        /// false for a line continuation, where <paramref name="ch"/> still has to be examined.
        /// </returns>
        bool ScanEscapeSequence(out char ch)
        {
            ch = ScanNextChar();
            switch (ch)
            {
                case 'n':
                    ch = Chars.LF;
                    return true;

                case 'r':
                    ch = Chars.CR;
                    return true;

                case 't':
                    ch = Chars.HT;
                    return true;

                case 'b':
                    ch = Chars.BS;
                    return true;

                case 'f':
                    ch = Chars.FF;
                    return true;

                case '(':
                    ch = Chars.ParenLeft;
                    return true;

                case ')':
                    ch = Chars.ParenRight;
                    return true;

                case '\\':
                    ch = Chars.BackSlash;
                    return true;

                case Chars.LF:
                    // Line continuation: the line end is dropped.
                    ch = ScanNextChar();
                    return false;

                default:
                    if (char.IsDigit(ch))
                    {
                        // Octal character code of one to three digits.
                        int n = ch - '0';
                        if (char.IsDigit(_nextChar))
                        {
                            n = n * 8 + ScanNextChar() - '0';
                            if (char.IsDigit(_nextChar))
                                n = n * 8 + ScanNextChar() - '0';
                        }
                        ch = (char)n;
                    }
                    // Any other character stands for itself.
                    return true;
            }
        }

        /// <summary>
        /// Reports the end of content inside a literal string and returns the string symbol
        /// for the characters collected so far.
        /// </summary>
        CSymbol ReportUnterminatedString()
        {
            ContentReaderDiagnostics.HandleUnexpectedCharacter(Chars.EOF);
            return _symbol = CSymbol.String;
        }

        /// <summary>
        /// Scans a hexadecimal string ('&lt;' … '&gt;'). White space between digits is ignored and a
        /// missing final digit is treated as 0. If the decoded bytes start with FE FF, the
        /// remaining bytes are combined pairwise (high byte first) into characters.
        /// Characters that are not hexadecimal digits and a premature end of content are
        /// reported through ContentReaderDiagnostics.
        /// </summary>
        // TODO
        public CSymbol ScanHexadecimalString()
        {
            Debug.Assert(_currChar == Chars.Less);

            ClearToken();
            ScanNextChar();
            int highNibble = -1; // Value of a first digit waiting for its partner; -1 if none.
            while (true)
            {
                char ch = MoveToNonWhiteSpace();
                if (ch == '>')
                {
                    ScanNextChar();
                    break;
                }
                if (ch == Chars.EOF)
                {
                    // Unterminated string: report and stop instead of looping forever.
                    ContentReaderDiagnostics.HandleUnexpectedCharacter(ch);
                    break;
                }

                int digit = HexDigitValue(ch);
                if (digit < 0)
                {
                    // Not a hex digit: report it and skip it so the scan makes progress.
                    ContentReaderDiagnostics.HandleUnexpectedCharacter(ch);
                    ScanNextChar();
                    continue;
                }

                if (highNibble < 0)
                    highNibble = digit;
                else
                {
                    _token.Append((char)(highNibble * 16 + digit));
                    highNibble = -1;
                }
                ScanNextChar();
            }

            // Odd number of digits: the final digit is assumed to be 0.
            if (highNibble >= 0)
                _token.Append((char)(highNibble * 16));

            string chars = _token.ToString();
            int count = chars.Length;
            if (count > 2 && chars[0] == (char)0xFE && chars[1] == (char)0xFF)
            {
                Debug.Assert(count % 2 == 0);
                _token.Length = 0;
                // 'idx + 1 < count' keeps an odd byte count from reading past the end.
                for (int idx = 2; idx + 1 < count; idx += 2)
                    _token.Append((char)(chars[idx] * 256 + chars[idx + 1]));
            }
            return _symbol = CSymbol.HexString;
        }

        /// <summary>
        /// Returns the value (0..15) of a hexadecimal digit, or -1 if the character is not one.
        /// </summary>
        static int HexDigitValue(char ch)
        {
            if (ch >= '0' && ch <= '9')
                return ch - '0';
            if (ch >= 'A' && ch <= 'F')
                return ch - 'A' + 10;
            if (ch >= 'a' && ch <= 'f')
                return ch - 'a' + 10;
            return -1;
        }

        /// <summary>
        /// Move current position one character further in content stream.
        /// CR LF and a single CR are both delivered as LF.
        /// </summary>
        /// <returns>The new current character, or <c>Chars.EOF</c> after the last byte.</returns>
        internal char ScanNextChar()
        {
            if (ContLength <= _charIndex)
            {
                // All bytes have been read. Promote the look-ahead so the final byte is
                // still delivered as current character; from then on both are EOF.
                _currChar = _nextChar;
                _nextChar = Chars.EOF;
                if (_currChar == Chars.CR)
                {
                    // Treat single CR as LF
                    _currChar = Chars.LF;
                }
            }
            else
            {
                _currChar = _nextChar;
                _nextChar = (char)_content[_charIndex++];
                if (_currChar == Chars.CR)
                {
                    if (_nextChar == Chars.LF)
                    {
                        // Treat CR LF as LF
                        _currChar = _nextChar;
                        if (ContLength <= _charIndex)
                            _nextChar = Chars.EOF;
                        else
                            _nextChar = (char)_content[_charIndex++];
                    }
                    else
                    {
                        // Treat single CR as LF
                        _currChar = Chars.LF;
                    }
                }
            }
            return _currChar;
        }

        /// <summary>
        /// Resets the current token to the empty string and its numeric values to zero.
        /// </summary>
        void ClearToken()
        {
            _token.Length = 0;
            _tokenAsLong = 0;
            _tokenAsReal = 0;
        }

        /// <summary>
        /// Appends current character to the token and reads next one.
        /// </summary>
        internal char AppendAndScanNextChar()
        {
            _token.Append(_currChar);
            return ScanNextChar();
        }

        /// <summary>
        /// If the current character is not a white space, the function immediately returns it.
        /// Otherwise the PDF cursor is moved forward to the first non-white space or EOF.
        /// White spaces are NUL, HT, LF, FF, CR, and SP.
        /// </summary>
        public char MoveToNonWhiteSpace()
        {
            while (_currChar != Chars.EOF)
            {
                switch (_currChar)
                {
                    case Chars.NUL:
                    case Chars.HT:
                    case Chars.LF:
                    case Chars.FF:
                    case Chars.CR:
                    case Chars.SP:
                        ScanNextChar();
                        break;

                    default:
                        return _currChar;
                }
            }
            return _currChar;
        }

        /// <summary>
        /// Gets or sets the current symbol.
        /// </summary>
        public CSymbol Symbol
        {
            get { return _symbol; }
            set { _symbol = value; }
        }

        /// <summary>
        /// Gets the current token.
        /// </summary>
        public string Token
        {
            get { return _token.ToString(); }
        }

        /// <summary>
        /// Interprets current token as integer literal.
        /// </summary>
        internal int TokenToInteger
        {
            get
            {
                Debug.Assert(_tokenAsLong == int.Parse(_token.ToString(), CultureInfo.InvariantCulture));
                return (int)_tokenAsLong;
            }
        }

        /// <summary>
        /// Interpret current token as real or integer literal.
        /// </summary>
        internal double TokenToReal
        {
            get
            {
                // ReSharper disable once CompareOfFloatsByEqualityOperator
                Debug.Assert(_tokenAsReal == double.Parse(_token.ToString(), CultureInfo.InvariantCulture));
                return _tokenAsReal;
            }
        }

        /// <summary>
        /// Indicates whether the specified character is a content stream white-space character.
        /// </summary>
        internal static bool IsWhiteSpace(char ch)
        {
            switch (ch)
            {
                case Chars.NUL:  // 0 Null
                case Chars.HT:   // 9 Tab
                case Chars.LF:   // 10 Line feed
                case Chars.FF:   // 12 Form feed
                case Chars.CR:   // 13 Carriage return
                case Chars.SP:   // 32 Space
                    return true;
            }
            return false;
        }

        /// <summary>
        /// Indicates whether the specified character is a content operator character,
        /// i.e. a letter, '*', ''', or '"'.
        /// </summary>
        internal static bool IsOperatorChar(char ch)
        {
            if (char.IsLetter(ch))
                return true;

            switch (ch)
            {
                case Chars.Asterisk:    // *
                case Chars.QuoteSingle: // '
                case Chars.QuoteDbl:    // "
                    return true;
            }
            return false;
        }

        /// <summary>
        /// Indicates whether the specified character is a PDF delimiter character.
        /// </summary>
        internal static bool IsDelimiter(char ch)
        {
            // '{' and '}' are not in this list (they were commented out in the original code).
            switch (ch)
            {
                case '(':
                case ')':
                case '<':
                case '>':
                case '[':
                case ']':
                case '/':
                case '%':
                    return true;
            }
            return false;
        }

        /// <summary>
        /// Gets the length of the content.
        /// </summary>
        public int ContLength
        {
            get { return _content.Length; }
        }

        /// <summary>
        /// Gets or sets the index of the next byte to be read from the content.
        /// </summary>
        // ad
        // NOTE(review): the setter loads the same byte (index value - 1) into both the current
        // and the look-ahead character, whereas ScanNextChar keeps them one byte apart. This
        // looks unintended, but the callers are not visible here, so the behavior is left
        // unchanged - confirm before relying on it. The value must be at least 1.
        public int Position
        {
            get { return _charIndex; }
            set
            {
                _charIndex = value;
                _currChar = (char)_content[_charIndex - 1];
                _nextChar = (char)_content[_charIndex - 1];
            }
        }

        // The raw bytes of the content stream.
        readonly byte[] _content;

        // Index of the next byte to be read into the look-ahead character.
        int _charIndex;

        // The character the lexer currently looks at, and its successor.
        char _currChar;
        char _nextChar;

        // Text and numeric values of the current token.
        readonly StringBuilder _token = new StringBuilder();
        long _tokenAsLong;
        double _tokenAsReal;

        // Type of the most recently scanned token.
        CSymbol _symbol = CSymbol.None;
    }
}