2021-05-25 17:00:45 +05:00

903 lines
30 KiB
C#
Raw Blame History

#region PDFsharp - A .NET library for processing PDF
//
// Authors:
// Stefan Lange
//
// Copyright (c) 2005-2017 empira Software GmbH, Cologne Area (Germany)
//
// http://www.pdfsharp.com
// http://sourceforge.net/projects/pdfsharp
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
#endregion
using System;
using System.Globalization;
using System.Diagnostics;
using System.Text;
using System.IO;
using PdfSharp.Internal;
using PdfSharp.Pdf.Internal;
#pragma warning disable 1591
namespace PdfSharp.Pdf.IO
{
/// <summary>
/// Lexical analyzer for PDF files. Technically a PDF file is a stream of bytes. Some chunks
/// of bytes represent strings in several encodings. The actual encoding depends on the
/// context where the string is used. Therefore the bytes are 'raw encoded' into characters,
/// i.e. a character or token read by the lexer has always character values in the range from
/// 0 to 255.
/// </summary>
public class Lexer
{
/// <summary>
/// Initializes a new instance of the Lexer class.
/// </summary>
public Lexer(Stream pdfInputStream)
{
_pdfSteam = pdfInputStream;
_pdfLength = (int)_pdfSteam.Length;
_idxChar = 0;
Position = 0;
}
/// <summary>
/// Gets or sets the position within the PDF stream.
/// </summary>
public int Position
{
get { return _idxChar; }
set
{
_idxChar = value;
_pdfSteam.Position = value;
// ReadByte return -1 (eof) at the end of the stream.
_currChar = (char)_pdfSteam.ReadByte();
_nextChar = (char)_pdfSteam.ReadByte();
_token = new StringBuilder();
}
}
/// <summary>
/// Reads the next token and returns its type. If the token starts with a digit, the parameter
/// testReference specifies how to treat it. If it is false, the lexer scans for a single integer.
/// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference,
/// the token is set to the object ID followed by the generation number separated by a blank
/// (the 'R' is omitted from the token).
/// </summary>
// /// <param name="testReference">Indicates whether to test the next token if it is a reference.</param>
public Symbol ScanNextToken()
{
Again:
_token = new StringBuilder();
char ch = MoveToNonWhiteSpace();
switch (ch)
{
case '%':
// Eat comments, the parser doesn't handle them
//return symbol = ScanComment();
ScanComment();
goto Again;
case '/':
return _symbol = ScanName();
//case 'R':
// if (Lexer.IsWhiteSpace(nextChar))
// {
// ScanNextChar();
// return Symbol.R;
// }
// break;
case '+': //TODO is it so easy?
case '-':
return _symbol = ScanNumber();
case '(':
return _symbol = ScanLiteralString();
case '[':
ScanNextChar(true);
return _symbol = Symbol.BeginArray;
case ']':
ScanNextChar(true);
return _symbol = Symbol.EndArray;
case '<':
if (_nextChar == '<')
{
ScanNextChar(true);
ScanNextChar(true);
return _symbol = Symbol.BeginDictionary;
}
return _symbol = ScanHexadecimalString();
case '>':
if (_nextChar == '>')
{
ScanNextChar(true);
ScanNextChar(true);
return _symbol = Symbol.EndDictionary;
}
ParserDiagnostics.HandleUnexpectedCharacter(_nextChar);
break;
case '.':
return _symbol = ScanNumber();
}
if (char.IsDigit(ch))
#if true_
return ScanNumberOrReference();
#else
if (PeekReference())
return _symbol = ScanNumber();
else
return _symbol = ScanNumber();
#endif
if (char.IsLetter(ch))
return _symbol = ScanKeyword();
if (ch == Chars.EOF)
return _symbol = Symbol.Eof;
// #???
ParserDiagnostics.HandleUnexpectedCharacter(ch);
return _symbol = Symbol.None;
}
/// <summary>
/// Reads the raw content of a stream.
/// </summary>
public byte[] ReadStream(int length)
{
int pos;
// Skip illegal blanks behind <20>stream<61>.
while (_currChar == Chars.SP)
ScanNextChar(true);
// Skip new line behind <20>stream<61>.
if (_currChar == Chars.CR)
{
if (_nextChar == Chars.LF)
pos = _idxChar + 2;
else
pos = _idxChar + 1;
}
else
pos = _idxChar + 1;
_pdfSteam.Position = pos;
byte[] bytes = new byte[length];
int read = _pdfSteam.Read(bytes, 0, length);
Debug.Assert(read == length);
// With corrupted files, read could be different from length.
if (bytes.Length != read)
{
Array.Resize(ref bytes, read);
}
// Synchronize idxChar etc.
Position = pos + read;
return bytes;
}
/// <summary>
/// Reads a string in raw encoding.
/// </summary>
public String ReadRawString(int position, int length)
{
_pdfSteam.Position = position;
byte[] bytes = new byte[length];
_pdfSteam.Read(bytes, 0, length);
return PdfEncoders.RawEncoding.GetString(bytes, 0, bytes.Length);
}
/// <summary>
/// Scans a comment line.
/// </summary>
public Symbol ScanComment()
{
Debug.Assert(_currChar == Chars.Percent);
_token = new StringBuilder();
while (true)
{
char ch = AppendAndScanNextChar();
if (ch == Chars.LF || ch == Chars.EOF)
break;
}
// TODO: not correct
if (_token.ToString().StartsWith("%%EOF"))
return Symbol.Eof;
return _symbol = Symbol.Comment;
}
/// <summary>
/// Scans a name.
/// </summary>
public Symbol ScanName()
{
Debug.Assert(_currChar == Chars.Slash);
_token = new StringBuilder();
while (true)
{
char ch = AppendAndScanNextChar();
if (IsWhiteSpace(ch) || IsDelimiter(ch) || ch == Chars.EOF)
return _symbol = Symbol.Name;
if (ch == '#')
{
ScanNextChar(true);
char[] hex = new char[2];
hex[0] = _currChar;
hex[1] = _nextChar;
ScanNextChar(true);
// TODO Check syntax
ch = (char)(ushort)int.Parse(new string(hex), NumberStyles.AllowHexSpecifier);
_currChar = ch;
}
}
}
/// <summary>
/// Scans a number.
/// </summary>
public Symbol ScanNumber()
{
// I found a PDF file created with Acrobat 7 with this entry
// /Checksum 2996984786
// What is this? It is neither an integer nor a real.
// I introduced an UInteger...
bool period = false;
//bool sign;
_token = new StringBuilder();
char ch = _currChar;
if (ch == '+' || ch == '-')
{
//sign = true;
_token.Append(ch);
ch = ScanNextChar(true);
}
while (true)
{
if (char.IsDigit(ch))
{
_token.Append(ch);
}
else if (ch == '.')
{
if (period)
ParserDiagnostics.ThrowParserException("More than one period in number.");
period = true;
_token.Append(ch);
}
else
break;
ch = ScanNextChar(true);
}
if (period)
return Symbol.Real;
long l = Int64.Parse(_token.ToString(), CultureInfo.InvariantCulture);
if (l >= Int32.MinValue && l <= Int32.MaxValue)
return Symbol.Integer;
if (l > 0 && l <= UInt32.MaxValue)
return Symbol.UInteger;
// Got an AutoCAD PDF file that contains this: /C 264584027963392
// Best we can do is to convert it to real value.
return Symbol.Real;
//thr ow new PdfReaderException("Number exceeds integer range.");
}
public Symbol ScanNumberOrReference()
{
Symbol result = ScanNumber();
if (result == Symbol.Integer)
{
int pos = Position;
string objectNumber = Token;
}
return result;
}
/// <summary>
/// Scans a keyword.
/// </summary>
public Symbol ScanKeyword()
{
_token = new StringBuilder();
char ch = _currChar;
// Scan token
while (true)
{
if (char.IsLetter(ch))
_token.Append(ch);
else
break;
ch = ScanNextChar(false);
}
// Check known tokens.
switch (_token.ToString())
{
case "obj":
return _symbol = Symbol.Obj;
case "endobj":
return _symbol = Symbol.EndObj;
case "null":
return _symbol = Symbol.Null;
case "true":
case "false":
return _symbol = Symbol.Boolean;
case "R":
return _symbol = Symbol.R;
case "stream":
return _symbol = Symbol.BeginStream;
case "endstream":
return _symbol = Symbol.EndStream;
case "xref":
return _symbol = Symbol.XRef;
case "trailer":
return _symbol = Symbol.Trailer;
case "startxref":
return _symbol = Symbol.StartXRef;
}
// Anything else is treated as a keyword. Samples are f or n in iref.
return _symbol = Symbol.Keyword;
}
/// <summary>
/// Scans a literal string, contained between "(" and ")".
/// </summary>
public Symbol ScanLiteralString()
{
// Reference: 3.2.3 String Objects / Page 53
// Reference: TABLE 3.32 String Types / Page 157
Debug.Assert(_currChar == Chars.ParenLeft);
_token = new StringBuilder();
int parenLevel = 0;
char ch = ScanNextChar(false);
// Phase 1: deal with escape characters.
while (ch != Chars.EOF)
{
switch (ch)
{
case '(':
parenLevel++;
break;
case ')':
if (parenLevel == 0)
{
ScanNextChar(false);
// Is goto evil? We could move Phase 2 code here or create a subroutine for Phase 1.
goto Phase2;
}
parenLevel--;
break;
case '\\':
{
ch = ScanNextChar(false);
switch (ch)
{
case 'n':
ch = Chars.LF;
break;
case 'r':
ch = Chars.CR;
break;
case 't':
ch = Chars.HT;
break;
case 'b':
ch = Chars.BS;
break;
case 'f':
ch = Chars.FF;
break;
case '(':
ch = Chars.ParenLeft;
break;
case ')':
ch = Chars.ParenRight;
break;
case '\\':
ch = Chars.BackSlash;
break;
// AutoCAD PDFs my contain such strings: (\ )
case ' ':
ch = ' ';
break;
case Chars.CR:
case Chars.LF:
ch = ScanNextChar(false);
continue;
default:
// TODO IsOctalDigit(ch).
if (char.IsDigit(ch) && _nextChar != '8' && _nextChar != '9') // First octal character.
{
//// Octal character code.
//if (ch >= '8')
// ParserDiagnostics.HandleUnexpectedCharacter(ch);
int n = ch - '0';
if (char.IsDigit(_nextChar) && _nextChar != '8' && _nextChar != '9') // Second octal character.
{
ch = ScanNextChar(false);
//if (ch >= '8')
// ParserDiagnostics.HandleUnexpectedCharacter(ch);
n = n * 8 + ch - '0';
if (char.IsDigit(_nextChar) && _nextChar != '8' && _nextChar != '9') // Third octal character.
{
ch = ScanNextChar(false);
//if (ch >= '8')
// ParserDiagnostics.HandleUnexpectedCharacter(ch);
n = n * 8 + ch - '0';
}
}
ch = (char)n;
}
else
{
// PDF 32000: "If the character following the REVERSE SOLIDUS is not one of those shown in Table 3, the REVERSE SOLIDUS shall be ignored."
//TODO
// Debug.As sert(false, "Not implemented; unknown escape character.");
// ParserDiagnostics.HandleUnexpectedCharacter(ch);
//GetType();
}
break;
}
break;
}
default:
break;
}
_token.Append(ch);
ch = ScanNextChar(false);
}
// Phase 2: deal with UTF-16BE if necessary.
// UTF-16BE Unicode strings start with U+FEFF ("<22><>"). There can be empty strings with UTF-16BE prefix.
Phase2:
if (_token.Length >= 2 && _token[0] == '\xFE' && _token[1] == '\xFF')
{
// Combine two ANSI characters to get one Unicode character.
StringBuilder temp = _token;
int length = temp.Length;
if ((length & 1) == 1)
{
// TODO What does the PDF Reference say about this case? Assume (char)0 or treat the file as corrupted?
temp.Append(0);
++length;
DebugBreak.Break();
}
_token = new StringBuilder();
for (int i = 2; i < length; i += 2)
{
_token.Append((char)(256 * temp[i] + temp[i + 1]));
}
return _symbol = Symbol.UnicodeString;
}
// Adobe Reader also supports UTF-16LE.
if (_token.Length >= 2 && _token[0] == '\xFF' && _token[1] == '\xFE')
{
// Combine two ANSI characters to get one Unicode character.
StringBuilder temp = _token;
int length = temp.Length;
if ((length & 1) == 1)
{
// TODO What does the PDF Reference say about this case? Assume (char)0 or treat the file as corrupted?
temp.Append(0);
++length;
DebugBreak.Break();
}
_token = new StringBuilder();
for (int i = 2; i < length; i += 2)
{
_token.Append((char)(256 * temp[i + 1] + temp[i]));
}
return _symbol = Symbol.UnicodeString;
}
return _symbol = Symbol.String;
}
public Symbol ScanHexadecimalString()
{
Debug.Assert(_currChar == Chars.Less);
_token = new StringBuilder();
char[] hex = new char[2];
ScanNextChar(true);
while (true)
{
MoveToNonWhiteSpace();
if (_currChar == '>')
{
ScanNextChar(true);
break;
}
if (char.IsLetterOrDigit(_currChar))
{
hex[0] = char.ToUpper(_currChar);
// Second char is optional in PDF spec.
if (char.IsLetterOrDigit(_nextChar))
{
hex[1] = char.ToUpper(_nextChar);
ScanNextChar(true);
}
else
{
// We could check for ">" here and throw if we find anything else. The throw comes after the next iteration anyway.
hex[1] = '0';
}
ScanNextChar(true);
int ch = int.Parse(new string(hex), NumberStyles.AllowHexSpecifier);
_token.Append(Convert.ToChar(ch));
}
else
ParserDiagnostics.HandleUnexpectedCharacter(_currChar);
}
string chars = _token.ToString();
int count = chars.Length;
if (count > 2 && chars[0] == (char)0xFE && chars[1] == (char)0xFF)
{
Debug.Assert(count % 2 == 0);
_token.Length = 0;
for (int idx = 2; idx < count; idx += 2)
_token.Append((char)(chars[idx] * 256 + chars[idx + 1]));
return _symbol = Symbol.UnicodeHexString;
}
return _symbol = Symbol.HexString;
}
/// <summary>
/// Move current position one character further in PDF stream.
/// </summary>
internal char ScanNextChar(bool handleCRLF)
{
if (_pdfLength <= _idxChar)
{
_currChar = Chars.EOF;
_nextChar = Chars.EOF;
}
else
{
_currChar = _nextChar;
_nextChar = (char)_pdfSteam.ReadByte();
_idxChar++;
if (handleCRLF && _currChar == Chars.CR)
{
if (_nextChar == Chars.LF)
{
// Treat CR LF as LF.
_currChar = _nextChar;
_nextChar = (char)_pdfSteam.ReadByte();
_idxChar++;
}
else
{
// Treat single CR as LF.
_currChar = Chars.LF;
}
}
}
return _currChar;
}
///// <summary>
///// Resets the current token to the empty string.
///// </summary>
//void ClearToken()
//{
// _token.Length = 0;
//}
bool PeekReference()
{
// A Reference has the form "nnn mmm R". The implementation of the parser used a
// reduce/shift algorithm in the first place. But this case is the only one we need to
// look ahead 3 tokens.
int positon = Position;
// Skip digits.
while (char.IsDigit(_currChar))
ScanNextChar(true);
// Space expected.
if (_currChar != Chars.SP)
goto False;
// Skip spaces.
while (_currChar == Chars.SP)
ScanNextChar(true);
// Digit expected.
if (!char.IsDigit(_currChar))
goto False;
// Skip digits.
while (char.IsDigit(_currChar))
ScanNextChar(true);
// Space expected.
if (_currChar != Chars.SP)
goto False;
// Skip spaces.
while (_currChar == Chars.SP)
ScanNextChar(true);
// "R" expected.
// We can ignore _nextChar because there is no other valid token that starts with an 'R'.
if (_currChar != 'R')
goto False;
Position = positon;
return true;
False:
Position = positon;
return false;
}
/// <summary>
/// Appends current character to the token and reads next one.
/// </summary>
internal char AppendAndScanNextChar()
{
if (_currChar == Chars.EOF)
ParserDiagnostics.ThrowParserException("Undetected EOF reached.");
_token.Append(_currChar);
return ScanNextChar(true);
}
/// <summary>
/// If the current character is not a white space, the function immediately returns it.
/// Otherwise the PDF cursor is moved forward to the first non-white space or EOF.
/// White spaces are NUL, HT, LF, FF, CR, and SP.
/// </summary>
public char MoveToNonWhiteSpace()
{
while (_currChar != Chars.EOF)
{
switch (_currChar)
{
case Chars.NUL:
case Chars.HT:
case Chars.LF:
case Chars.FF:
case Chars.CR:
case Chars.SP:
ScanNextChar(true);
break;
case (char)11:
case (char)173:
ScanNextChar(true);
break;
default:
return _currChar;
}
}
return _currChar;
}
#if DEBUG
public string SurroundingsOfCurrentPosition(bool hex)
{
const int range = 20;
int start = Math.Max(Position - range, 0);
int length = Math.Min(2 * range, PdfLength - start);
long posOld = _pdfSteam.Position;
_pdfSteam.Position = start;
byte[] bytes = new byte[length];
_pdfSteam.Read(bytes, 0, length);
_pdfSteam.Position = posOld;
string result = "";
if (hex)
{
for (int idx = 0; idx < length; idx++)
result += ((int)bytes[idx]).ToString("x2");
//result += string.Format("{0:", (int) bytes[idx]);
}
else
{
for (int idx = 0; idx < length; idx++)
result += (char)bytes[idx];
}
return result;
}
#endif
/// <summary>
/// Gets the current symbol.
/// </summary>
public Symbol Symbol
{
get { return _symbol; }
set { _symbol = value; }
}
/// <summary>
/// Gets the current token.
/// </summary>
public string Token
{
get { return _token.ToString(); }
}
/// <summary>
/// Interprets current token as boolean literal.
/// </summary>
public bool TokenToBoolean
{
get
{
Debug.Assert(_token.ToString() == "true" || _token.ToString() == "false");
return _token.ToString()[0] == 't';
}
}
/// <summary>
/// Interprets current token as integer literal.
/// </summary>
public int TokenToInteger
{
get
{
//Debug.As sert(_token.ToString().IndexOf('.') == -1);
return int.Parse(_token.ToString(), CultureInfo.InvariantCulture);
}
}
/// <summary>
/// Interprets current token as unsigned integer literal.
/// </summary>
public uint TokenToUInteger
{
get
{
//Debug.As sert(_token.ToString().IndexOf('.') == -1);
return uint.Parse(_token.ToString(), CultureInfo.InvariantCulture);
}
}
/// <summary>
/// Interprets current token as real or integer literal.
/// </summary>
public double TokenToReal
{
get { return double.Parse(_token.ToString(), CultureInfo.InvariantCulture); }
}
/// <summary>
/// Interprets current token as object ID.
/// </summary>
public PdfObjectID TokenToObjectID
{
get
{
string[] numbers = Token.Split('|');
int objectNumber = Int32.Parse(numbers[0]);
int generationNumber = Int32.Parse(numbers[1]);
return new PdfObjectID(objectNumber, generationNumber);
}
}
/// <summary>
/// Indicates whether the specified character is a PDF white-space character.
/// </summary>
internal static bool IsWhiteSpace(char ch)
{
switch (ch)
{
case Chars.NUL: // 0 Null
case Chars.HT: // 9 Horizontal Tab
case Chars.LF: // 10 Line Feed
case Chars.FF: // 12 Form Feed
case Chars.CR: // 13 Carriage Return
case Chars.SP: // 32 Space
return true;
}
return false;
}
/// <summary>
/// Indicates whether the specified character is a PDF delimiter character.
/// </summary>
internal static bool IsDelimiter(char ch)
{
switch (ch)
{
case '(':
case ')':
case '<':
case '>':
case '[':
case ']':
case '{':
case '}':
case '/':
case '%':
return true;
}
return false;
}
/// <summary>
/// Gets the length of the PDF output.
/// </summary>
public int PdfLength
{
get { return _pdfLength; }
}
readonly int _pdfLength;
int _idxChar;
char _currChar;
char _nextChar;
StringBuilder _token;
Symbol _symbol = Symbol.None;
readonly Stream _pdfSteam;
}
}