ASCU_ALL/PrintPDF/PdfSharp/Pdf.IO/Lexer.cs

#region PDFsharp - A .NET library for processing PDF
//
// Authors:
//   Stefan Lange
//
// Copyright (c) 2005-2017 empira Software GmbH, Cologne Area (Germany)
//
// http://www.pdfsharp.com
// http://sourceforge.net/projects/pdfsharp
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
// DEALINGS IN THE SOFTWARE.
#endregion

using System;
using System.Globalization;
using System.Diagnostics;
using System.Text;
using System.IO;
using PdfSharp.Internal;
using PdfSharp.Pdf.Internal;

#pragma warning disable 1591

namespace PdfSharp.Pdf.IO
{
    /// <summary>
    /// Lexical analyzer for PDF files. Technically a PDF file is a stream of bytes. Some chunks
    /// of bytes represent strings in several encodings. The actual encoding depends on the
    /// context where the string is used. Therefore the bytes are 'raw encoded' into characters,
    /// i.e. a character or token read by the lexer has always character values in the range from
    /// 0 to 255.
    /// </summary>
    public class Lexer
    {
        /// <summary>
        /// Initializes a new instance of the Lexer class on the specified stream and positions
        /// the lexer at the first byte of that stream.
        /// </summary>
        /// <param name="pdfInputStream">The stream that contains the PDF data. Its Length and
        /// Position members are used, so it must support both.</param>
        /// <exception cref="ArgumentNullException"><paramref name="pdfInputStream"/> is null.</exception>
        /// <exception cref="OverflowException">The stream is longer than Int32.MaxValue bytes.</exception>
        public Lexer(Stream pdfInputStream)
        {
            if (pdfInputStream == null)
                throw new ArgumentNullException("pdfInputStream");

            _pdfSteam = pdfInputStream;
            // All positions in this class are Int32 values. Fail loudly instead of silently
            // truncating the length of a stream that does not fit.
            _pdfLength = checked((int)_pdfSteam.Length);
            // The Position setter initializes _idxChar, both look-ahead characters, and the token buffer.
            Position = 0;
        }

        /// <summary>
        /// Gets or sets the index of the current character within the PDF stream.
        /// Setting the position seeks the underlying stream, reloads the current and the next
        /// character from it, and discards the current token.
        /// </summary>
        public int Position
        {
            get
            {
                return _idxChar;
            }
            set
            {
                _idxChar = value;

                Stream stream = _pdfSteam;
                stream.Position = value;

                // ReadByte returns -1 at the end of the stream; the cast to char keeps that
                // marker as the character value used for end-of-file.
                _currChar = (char)stream.ReadByte();
                _nextChar = (char)stream.ReadByte();

                _token = new StringBuilder();
            }
        }

        /// <summary>
        /// Reads the next token and returns its type. White space and comments in front of the
        /// token are skipped. A token that starts with a digit, a sign, or a period is scanned as
        /// a single number; the lexer does not combine numbers into references.
        /// </summary>
        /// <returns>The symbol of the scanned token, which is also stored as the current symbol.
        /// Symbol.Eof is returned at the end of the stream.</returns>
        public Symbol ScanNextToken()
        {
            Again:
            _token = new StringBuilder();

            char ch = MoveToNonWhiteSpace();
            switch (ch)
            {
                case '%':
                    // Eat comments, the parser doesn't handle them.
                    ScanComment();
                    goto Again;

                case '/':
                    return _symbol = ScanName();

                case '+': //TODO is it so easy?
                case '-':
                    return _symbol = ScanNumber();

                case '(':
                    return _symbol = ScanLiteralString();

                case '[':
                    ScanNextChar(true);
                    return _symbol = Symbol.BeginArray;

                case ']':
                    ScanNextChar(true);
                    return _symbol = Symbol.EndArray;

                case '<':
                    if (_nextChar == '<')
                    {
                        ScanNextChar(true);
                        ScanNextChar(true);
                        return _symbol = Symbol.BeginDictionary;
                    }
                    return _symbol = ScanHexadecimalString();

                case '>':
                    if (_nextChar == '>')
                    {
                        ScanNextChar(true);
                        ScanNextChar(true);
                        return _symbol = Symbol.EndDictionary;
                    }
                    // A single '>' outside of a hexadecimal string is not a valid token start.
                    ParserDiagnostics.HandleUnexpectedCharacter(_nextChar);
                    break;

                case '.':
                    return _symbol = ScanNumber();
            }

            // The previous implementation looked ahead for the "nnn mmm R" pattern here, but
            // scanned a plain number in both cases. The look-ahead only cost two stream seeks
            // per number and was dropped; the parser still assembles references itself.
            if (char.IsDigit(ch))
                return _symbol = ScanNumber();

            if (char.IsLetter(ch))
                return _symbol = ScanKeyword();

            if (ch == Chars.EOF)
                return _symbol = Symbol.Eof;

            ParserDiagnostics.HandleUnexpectedCharacter(ch);
            return _symbol = Symbol.None;
        }

        /// <summary>
        /// Reads the raw content of a stream object. The lexer must be positioned directly
        /// behind the 'stream' keyword; blanks and one end-of-line marker are skipped first.
        /// </summary>
        /// <param name="length">The number of bytes to read.</param>
        /// <returns>The bytes read. The array is shorter than <paramref name="length"/> if the
        /// underlying stream ends early, e.g. with corrupted files.</returns>
        public byte[] ReadStream(int length)
        {
            // Skip illegal blanks behind 'stream'.
            while (_currChar == Chars.SP)
                ScanNextChar(true);

            // Skip the new line behind 'stream': two bytes for CR LF, one byte otherwise.
            int pos = _idxChar + 1;
            if (_currChar == Chars.CR && _nextChar == Chars.LF)
                pos = _idxChar + 2;

            _pdfSteam.Position = pos;
            byte[] bytes = new byte[length];

            // Stream.Read may return fewer bytes than requested even when more data follows,
            // so keep reading until the buffer is full or the stream is exhausted.
            int read = 0;
            while (read < length)
            {
                int count = _pdfSteam.Read(bytes, read, length - read);
                if (count <= 0)
                    break;
                read += count;
            }
            Debug.Assert(read == length);

            // With corrupted files, read could be different from length.
            if (bytes.Length != read)
            {
                Array.Resize(ref bytes, read);
            }

            // Synchronize idxChar etc.
            Position = pos + read;
            return bytes;
        }

        /// <summary>
        /// Reads a string in raw encoding directly from the underlying stream.
        /// Only the position of the underlying stream is changed; the lexer's current position
        /// and look-ahead characters are not updated by this method.
        /// </summary>
        /// <param name="position">The stream position of the first byte to read.</param>
        /// <param name="length">The number of bytes to read.</param>
        /// <returns>A string with <paramref name="length"/> characters. Bytes that could not be
        /// read because the stream ended early are left as zero.</returns>
        public String ReadRawString(int position, int length)
        {
            _pdfSteam.Position = position;
            byte[] bytes = new byte[length];

            // Stream.Read may deliver the data in several chunks; loop until the buffer is
            // full or the stream is exhausted.
            int read = 0;
            while (read < length)
            {
                int count = _pdfSteam.Read(bytes, read, length - read);
                if (count <= 0)
                    break;
                read += count;
            }
            return PdfEncoders.RawEncoding.GetString(bytes, 0, bytes.Length);
        }

        /// <summary>
        /// Scans a comment line. The token receives all characters from the leading '%' up to,
        /// but not including, the line feed or end-of-file that terminates the comment.
        /// </summary>
        /// <returns>Symbol.Eof if the comment starts with "%%EOF" (the current symbol is left
        /// unchanged in this case), otherwise Symbol.Comment.</returns>
        public Symbol ScanComment()
        {
            Debug.Assert(_currChar == Chars.Percent);

            _token = new StringBuilder();
            while (true)
            {
                char ch = AppendAndScanNextChar();
                if (ch == Chars.LF || ch == Chars.EOF)
                    break;
            }
            // TODO: not correct
            // The token consists of raw byte values, so compare ordinally. A culture-sensitive
            // comparison could ignore some characters and would depend on the current culture.
            if (_token.ToString().StartsWith("%%EOF", StringComparison.Ordinal))
                return Symbol.Eof;
            return _symbol = Symbol.Comment;
        }

        /// <summary>
        /// Scans a name. The token includes the leading slash; a '#' followed by two
        /// hexadecimal digits is replaced by the character with that code.
        /// </summary>
        /// <returns>Always Symbol.Name, which is also stored as the current symbol.</returns>
        public Symbol ScanName()
        {
            Debug.Assert(_currChar == Chars.Slash);

            _token = new StringBuilder();
            for (; ; )
            {
                char upcoming = AppendAndScanNextChar();
                if (upcoming == Chars.EOF || IsWhiteSpace(upcoming) || IsDelimiter(upcoming))
                    return _symbol = Symbol.Name;

                if (upcoming != '#')
                    continue;

                // Step onto the first hex digit; the second one is then the look-ahead character.
                ScanNextChar(true);
                string hexDigits = new string(new[] { _currChar, _nextChar });
                ScanNextChar(true);
                // TODO Check syntax
                // Substitute the decoded character for the second hex digit, so that the next
                // loop iteration appends it to the token.
                _currChar = (char)(ushort)int.Parse(hexDigits, NumberStyles.AllowHexSpecifier);
            }
        }

        /// <summary>
        /// Scans a number: an optional sign followed by digits and at most one period.
        /// The scanned text is stored in the token; the current symbol is not changed.
        /// </summary>
        /// <returns>Symbol.Real if the number contains a period or does not fit into an
        /// unsigned 32-bit integer, Symbol.Integer if it fits into Int32, and Symbol.UInteger
        /// for positive values above Int32.MaxValue up to UInt32.MaxValue.</returns>
        public Symbol ScanNumber()
        {
            // I found a PDF file created with Acrobat 7 with this entry 
            //   /Checksum 2996984786
            // What is this? It is neither an integer nor a real.
            // I introduced an UInteger...
            bool period = false;

            _token = new StringBuilder();
            char ch = _currChar;
            if (ch == '+' || ch == '-')
            {
                _token.Append(ch);
                ch = ScanNextChar(true);
            }
            while (true)
            {
                if (char.IsDigit(ch))
                {
                    _token.Append(ch);
                }
                else if (ch == '.')
                {
                    if (period)
                        ParserDiagnostics.ThrowParserException("More than one period in number.");

                    period = true;
                    _token.Append(ch);
                }
                else
                    break;
                ch = ScanNextChar(true);
            }

            if (period)
                return Symbol.Real;

            long l;
            try
            {
                l = Int64.Parse(_token.ToString(), CultureInfo.InvariantCulture);
            }
            catch (OverflowException)
            {
                // More digits than Int64 can hold. Same strategy as for other oversized
                // integers below: let the caller read the token as a real value.
                return Symbol.Real;
            }

            if (l >= Int32.MinValue && l <= Int32.MaxValue)
                return Symbol.Integer;
            if (l > 0 && l <= UInt32.MaxValue)
                return Symbol.UInteger;

            // Got an AutoCAD PDF file that contains this: /C 264584027963392
            // Best we can do is to convert it to real value.
            return Symbol.Real;
        }

        /// <summary>
        /// Scans a number. Despite its name, this method does not detect references; it
        /// returns exactly what ScanNumber returns.
        /// </summary>
        public Symbol ScanNumberOrReference()
        {
            // The former body stored the position and the token in locals that were never used.
            return ScanNumber();
        }

        /// <summary>
        /// Scans a keyword, i.e. a run of letters starting at the current character, and maps
        /// it to the matching symbol.
        /// </summary>
        /// <returns>The symbol of a known PDF keyword, or Symbol.Keyword for any other run of
        /// letters. The result is also stored as the current symbol.</returns>
        public Symbol ScanKeyword()
        {
            _token = new StringBuilder();

            // Collect letters; line ends are not translated while doing so.
            for (char letter = _currChar; char.IsLetter(letter); letter = ScanNextChar(false))
                _token.Append(letter);

            Symbol kind;
            switch (_token.ToString())
            {
                case "obj":
                    kind = Symbol.Obj;
                    break;

                case "endobj":
                    kind = Symbol.EndObj;
                    break;

                case "null":
                    kind = Symbol.Null;
                    break;

                case "true":
                case "false":
                    kind = Symbol.Boolean;
                    break;

                case "R":
                    kind = Symbol.R;
                    break;

                case "stream":
                    kind = Symbol.BeginStream;
                    break;

                case "endstream":
                    kind = Symbol.EndStream;
                    break;

                case "xref":
                    kind = Symbol.XRef;
                    break;

                case "trailer":
                    kind = Symbol.Trailer;
                    break;

                case "startxref":
                    kind = Symbol.StartXRef;
                    break;

                default:
                    // Anything else is treated as a keyword. Samples are f or n in iref.
                    kind = Symbol.Keyword;
                    break;
            }
            return _symbol = kind;
        }

        /// <summary>
        /// Scans a literal string, contained between "(" and ")".
        /// Escape sequences are resolved while scanning. If the result starts with a UTF-16
        /// byte order mark, the characters behind it are combined pairwise into Unicode
        /// characters.
        /// </summary>
        /// <returns>Symbol.UnicodeString if the string starts with a UTF-16BE or UTF-16LE byte
        /// order mark, otherwise Symbol.String. The result is also stored as the current symbol.</returns>
        public Symbol ScanLiteralString()
        {
            // Reference: 3.2.3  String Objects / Page 53
            // Reference: TABLE 3.32  String Types / Page 157

            Debug.Assert(_currChar == Chars.ParenLeft);
            _token = new StringBuilder();
            int parenLevel = 0;
            char ch = ScanNextChar(false);

            // Phase 1: deal with escape characters.
            while (ch != Chars.EOF)
            {
                switch (ch)
                {
                    case '(':
                        parenLevel++;
                        break;

                    case ')':
                        if (parenLevel == 0)
                        {
                            ScanNextChar(false);
                            // Is goto evil? We could move Phase 2 code here or create a subroutine for Phase 1.
                            goto Phase2;
                        }
                        parenLevel--;
                        break;

                    case '\\':
                        {
                            ch = ScanNextChar(false);
                            switch (ch)
                            {
                                case 'n':
                                    ch = Chars.LF;
                                    break;

                                case 'r':
                                    ch = Chars.CR;
                                    break;

                                case 't':
                                    ch = Chars.HT;
                                    break;

                                case 'b':
                                    ch = Chars.BS;
                                    break;

                                case 'f':
                                    ch = Chars.FF;
                                    break;

                                case '(':
                                    ch = Chars.ParenLeft;
                                    break;

                                case ')':
                                    ch = Chars.ParenRight;
                                    break;

                                case '\\':
                                    ch = Chars.BackSlash;
                                    break;

                                // AutoCAD PDFs may contain such strings: (\ ) 
                                case ' ':
                                    ch = ' ';
                                    break;

                                case Chars.CR:
                                    // Line continuation. CR LF is a single end-of-line marker,
                                    // so the LF must be dropped together with the CR.
                                    if (_nextChar == Chars.LF)
                                        ScanNextChar(false);
                                    ch = ScanNextChar(false);
                                    continue;

                                case Chars.LF:
                                    // Line continuation: drop the backslash and the line feed.
                                    ch = ScanNextChar(false);
                                    continue;

                                default:
                                    if (IsOctalDigit(ch))
                                    {
                                        // Octal character code with one to three digits.
                                        int n = ch - '0';
                                        if (IsOctalDigit(_nextChar))
                                        {
                                            ch = ScanNextChar(false);
                                            n = n * 8 + ch - '0';
                                            if (IsOctalDigit(_nextChar))
                                            {
                                                ch = ScanNextChar(false);
                                                n = n * 8 + ch - '0';
                                            }
                                        }
                                        // Three octal digits can exceed one byte; PDF 32000
                                        // says the high-order overflow shall be ignored.
                                        ch = (char)(n & 0xFF);
                                    }
                                    else
                                    {
                                        // PDF 32000: "If the character following the REVERSE SOLIDUS is not one of those shown in Table 3, the REVERSE SOLIDUS shall be ignored."
                                        // Nothing to do: ch is appended below without the backslash.
                                    }
                                    break;
                            }
                            break;
                        }
                    default:
                        break;
                }

                _token.Append(ch);
                ch = ScanNextChar(false);
            }

            // Phase 2: deal with UTF-16 if necessary.
            // UTF-16BE Unicode strings start with U+FEFF, i.e. the bytes 0xFE 0xFF. There can be
            // empty strings with UTF-16BE prefix.
            Phase2:
            if (_token.Length >= 2 && _token[0] == '\xFE' && _token[1] == '\xFF')
            {
                CombineTokenToUtf16(true);
                return _symbol = Symbol.UnicodeString;
            }
            // Adobe Reader also supports UTF-16LE.
            if (_token.Length >= 2 && _token[0] == '\xFF' && _token[1] == '\xFE')
            {
                CombineTokenToUtf16(false);
                return _symbol = Symbol.UnicodeString;
            }
            return _symbol = Symbol.String;
        }

        /// <summary>
        /// Indicates whether the specified character is one of the octal digits '0' to '7'.
        /// </summary>
        static bool IsOctalDigit(char ch)
        {
            return ch >= '0' && ch <= '7';
        }

        /// <summary>
        /// Replaces the token by the Unicode characters that result from combining each pair of
        /// characters behind the two-character byte order mark.
        /// </summary>
        /// <param name="bigEndian">True if the first character of a pair is the high-order byte.</param>
        void CombineTokenToUtf16(bool bigEndian)
        {
            StringBuilder raw = _token;
            int length = raw.Length;
            if ((length & 1) == 1)
            {
                // TODO What does the PDF Reference say about this case? Assume (char)0 or treat the file as corrupted?
                // Append the character with code 0. Append(0) would append the digit '0' (0x30).
                raw.Append('\0');
                ++length;
                DebugBreak.Break();
            }

            _token = new StringBuilder();
            for (int i = 2; i < length; i += 2)
            {
                char first = raw[i];
                char second = raw[i + 1];
                _token.Append(bigEndian
                    ? (char)(256 * first + second)
                    : (char)(256 * second + first));
            }
        }

        /// <summary>
        /// Scans a hexadecimal string, contained between "&lt;" and "&gt;". Each pair of hex
        /// digits becomes one character of the token; white space between pairs is skipped.
        /// </summary>
        /// <returns>Symbol.UnicodeHexString if the string starts with the bytes 0xFE 0xFF and
        /// has more content behind them, in which case the token holds the combined Unicode
        /// characters; otherwise Symbol.HexString.</returns>
        public Symbol ScanHexadecimalString()
        {
            Debug.Assert(_currChar == Chars.Less);

            _token = new StringBuilder();
            char[] hex = new char[2];
            ScanNextChar(true);
            while (true)
            {
                MoveToNonWhiteSpace();
                if (_currChar == '>')
                {
                    ScanNextChar(true);
                    break;
                }
                if (char.IsLetterOrDigit(_currChar))
                {
                    hex[0] = char.ToUpper(_currChar);
                    // Second char is optional in PDF spec.
                    if (char.IsLetterOrDigit(_nextChar))
                    {
                        hex[1] = char.ToUpper(_nextChar);
                        ScanNextChar(true);
                    }
                    else
                    {
                        // We could check for ">" here and throw if we find anything else. The throw comes after the next iteration anyway.
                        hex[1] = '0';
                    }
                    ScanNextChar(true);

                    int ch = int.Parse(new string(hex), NumberStyles.AllowHexSpecifier);
                    _token.Append(Convert.ToChar(ch));
                }
                else
                    ParserDiagnostics.HandleUnexpectedCharacter(_currChar);
            }

            string chars = _token.ToString();
            int count = chars.Length;
            if (count > 2 && chars[0] == (char)0xFE && chars[1] == (char)0xFF)
            {
                Debug.Assert(count % 2 == 0);
                _token.Length = 0;
                for (int idx = 2; idx < count; idx += 2)
                {
                    // With an odd number of bytes the last pair is incomplete. Treat the
                    // missing low-order byte as 0 instead of reading beyond the end of the string.
                    char high = chars[idx];
                    char low = idx + 1 < count ? chars[idx + 1] : '\0';
                    _token.Append((char)(high * 256 + low));
                }
                return _symbol = Symbol.UnicodeHexString;
            }
            return _symbol = Symbol.HexString;
        }

        /// <summary>
        /// Moves the current position one character further in the PDF stream.
        /// </summary>
        /// <param name="handleCRLF">If true, a CR LF pair and a single CR are both delivered
        /// as one LF character.</param>
        /// <returns>The new current character, or Chars.EOF if the current index is already at
        /// or behind the end of the PDF.</returns>
        internal char ScanNextChar(bool handleCRLF)
        {
            if (_idxChar >= _pdfLength)
            {
                // Nothing left to read.
                _currChar = Chars.EOF;
                _nextChar = Chars.EOF;
                return _currChar;
            }

            // Shift the look-ahead character into the current one and fetch a new look-ahead.
            _currChar = _nextChar;
            _nextChar = (char)_pdfSteam.ReadByte();
            _idxChar++;

            if (!handleCRLF || _currChar != Chars.CR)
                return _currChar;

            if (_nextChar != Chars.LF)
            {
                // Treat single CR as LF.
                _currChar = Chars.LF;
                return _currChar;
            }

            // Treat CR LF as LF: consume the LF as well.
            _currChar = _nextChar;
            _nextChar = (char)_pdfSteam.ReadByte();
            _idxChar++;
            return _currChar;
        }

        ///// <summary>
        ///// Resets the current token to the empty string.
        ///// </summary>
        //void ClearToken()
        //{
        //    _token.Length = 0;
        //}

        /// <summary>
        /// Tests whether the characters at the current position have the form "nnn mmm R" of a
        /// reference. The position is restored before the method returns.
        /// </summary>
        bool PeekReference()
        {
            // A Reference has the form "nnn mmm R". The implementation of the parser used a
            // reduce/shift algorithm in the first place. But this case is the only one we need to
            // look ahead 3 tokens.
            int savedPosition = Position;
            bool isReference = false;

            // Object number: skip digits, then at least one space is expected.
            while (char.IsDigit(_currChar))
                ScanNextChar(true);

            if (_currChar == Chars.SP)
            {
                while (_currChar == Chars.SP)
                    ScanNextChar(true);

                // Generation number: at least one digit, followed by at least one space.
                if (char.IsDigit(_currChar))
                {
                    while (char.IsDigit(_currChar))
                        ScanNextChar(true);

                    if (_currChar == Chars.SP)
                    {
                        while (_currChar == Chars.SP)
                            ScanNextChar(true);

                        // "R" expected.
                        // We can ignore _nextChar because there is no other valid token that starts with an 'R'.
                        isReference = _currChar == 'R';
                    }
                }
            }

            Position = savedPosition;
            return isReference;
        }

        /// <summary>
        /// Appends the current character to the token and moves on to the next character,
        /// delivering CR LF and single CR as LF.
        /// </summary>
        /// <returns>The new current character.</returns>
        internal char AppendAndScanNextChar()
        {
            char current = _currChar;
            if (current == Chars.EOF)
            {
                ParserDiagnostics.ThrowParserException("Undetected EOF reached.");
            }

            _token.Append(current);
            char following = ScanNextChar(true);
            return following;
        }

        /// <summary>
        /// If the current character is not a white space, the function immediately returns it.
        /// Otherwise the PDF cursor is moved forward to the first non-white space or EOF.
        /// Skipped are the white spaces NUL, HT, LF, FF, CR, and SP, and in addition the
        /// characters with the codes 11 and 173.
        /// </summary>
        /// <returns>The current character after skipping.</returns>
        public char MoveToNonWhiteSpace()
        {
            while (_currChar != Chars.EOF)
            {
                bool skippable = IsWhiteSpace(_currChar)
                    || _currChar == (char)11
                    || _currChar == (char)173;
                if (!skippable)
                    break;

                ScanNextChar(true);
            }
            return _currChar;
        }

#if DEBUG
        /// <summary>
        /// Returns up to 40 bytes of the PDF stream, starting up to 20 bytes in front of the
        /// current position. The position of the underlying stream is restored afterwards.
        /// </summary>
        /// <param name="hex">If true, each byte is rendered as two lower-case hex digits;
        /// otherwise each byte is rendered as the character with that code.</param>
        public string SurroundingsOfCurrentPosition(bool hex)
        {
            const int range = 20;
            int first = Math.Max(Position - range, 0);
            int count = Math.Min(2 * range, PdfLength - first);

            // Read the bytes without disturbing the stream position the lexer relies on.
            long savedPosition = _pdfSteam.Position;
            _pdfSteam.Position = first;
            byte[] buffer = new byte[count];
            _pdfSteam.Read(buffer, 0, count);
            _pdfSteam.Position = savedPosition;

            StringBuilder text = new StringBuilder();
            for (int idx = 0; idx < count; idx++)
            {
                if (hex)
                    text.Append(((int)buffer[idx]).ToString("x2"));
                else
                    text.Append((char)buffer[idx]);
            }
            return text.ToString();
        }
#endif

        /// <summary>
        /// Gets or sets the current symbol, i.e. the symbol stored by the most recent scan
        /// operation or assigned by the caller.
        /// </summary>
        public Symbol Symbol
        {
            get
            {
                return _symbol;
            }
            set
            {
                _symbol = value;
            }
        }

        /// <summary>
        /// Gets the text of the current token as a new string.
        /// </summary>
        public string Token
        {
            get
            {
                string text = _token.ToString();
                return text;
            }
        }

        /// <summary>
        /// Interprets the current token as a boolean literal. The token is expected to be
        /// "true" or "false"; only its first character is evaluated.
        /// </summary>
        public bool TokenToBoolean
        {
            get
            {
                string text = _token.ToString();
                Debug.Assert(text == "true" || text == "false");
                return text[0] == 't';
            }
        }

        /// <summary>
        /// Interprets the current token as a signed 32-bit integer literal, parsed with the
        /// invariant culture.
        /// </summary>
        public int TokenToInteger
        {
            get
            {
                string text = _token.ToString();
                int number = int.Parse(text, CultureInfo.InvariantCulture);
                return number;
            }
        }

        /// <summary>
        /// Interprets the current token as an unsigned 32-bit integer literal, parsed with the
        /// invariant culture.
        /// </summary>
        public uint TokenToUInteger
        {
            get
            {
                string text = _token.ToString();
                uint number = uint.Parse(text, CultureInfo.InvariantCulture);
                return number;
            }
        }

        /// <summary>
        /// Interprets the current token as a real or integer literal and returns it as a
        /// double, parsed with the invariant culture.
        /// </summary>
        public double TokenToReal
        {
            get
            {
                string text = _token.ToString();
                return double.Parse(text, CultureInfo.InvariantCulture);
            }
        }

        /// <summary>
        /// Interprets the current token as an object ID. The token is split at '|'; the first
        /// part is parsed as the object number and the second part as the generation number.
        /// </summary>
        public PdfObjectID TokenToObjectID
        {
            get
            {
                // NOTE(review): no scan method in this class produces a token that contains
                // '|' - confirm which caller provides tokens in this format.
                string[] numbers = Token.Split('|');
                // Parse culture-independently, like the other TokenTo... properties do.
                int objectNumber = Int32.Parse(numbers[0], CultureInfo.InvariantCulture);
                int generationNumber = Int32.Parse(numbers[1], CultureInfo.InvariantCulture);
                return new PdfObjectID(objectNumber, generationNumber);
            }
        }

        /// <summary>
        /// Indicates whether the specified character is a PDF white-space character, i.e. one
        /// of NUL (0), HT (9), LF (10), FF (12), CR (13), or SP (32).
        /// </summary>
        internal static bool IsWhiteSpace(char ch)
        {
            return ch == Chars.NUL
                || ch == Chars.HT
                || ch == Chars.LF
                || ch == Chars.FF
                || ch == Chars.CR
                || ch == Chars.SP;
        }

        /// <summary>
        /// Indicates whether the specified character is a PDF delimiter character, i.e. one of
        /// ( ) &lt; &gt; [ ] { } / %.
        /// </summary>
        internal static bool IsDelimiter(char ch)
        {
            // IndexOf(char) performs an ordinal search, so this is a plain membership test.
            const string delimiters = "()<>[]{}/%";
            return delimiters.IndexOf(ch) >= 0;
        }

        /// <summary>
        /// Gets the length in bytes of the PDF stream, as determined when the lexer was created.
        /// </summary>
        public int PdfLength
        {
            get
            {
                return _pdfLength;
            }
        }

        // Length of the PDF stream in bytes; set once in the constructor.
        readonly int _pdfLength;
        // Index of the current character within the PDF stream.
        int _idxChar;
        // The character at the current position; Chars.EOF at the end of the stream.
        char _currChar;
        // One character of look-ahead behind _currChar.
        char _nextChar;
        // Text of the token that is currently being scanned.
        StringBuilder _token;
        // Symbol stored by the most recent scan operation.
        Symbol _symbol = Symbol.None;

        // The stream the PDF data is read from.
        readonly Stream _pdfSteam;
    }
}