HTML Tidy  0.1
lexer.h
00001 #ifndef __LEXER_H__
00002 #define __LEXER_H__
00003 
00004 /* lexer.h -- Lexer for html parser
00005   
00006    (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
00007    See tidy.h for the copyright notice.
00008   
00009    CVS Info:
00010     $Author: arnaud02 $ 
00011     $Date: 2007/01/17 10:51:18 $ 
00012     $Revision: 1.40 $ 
00013 
00014 */
00015 
00016 /*
00017   Given an input source, it returns a sequence of tokens.
00018 
00019      GetToken(source) gets the next token
00020      UngetToken(source) provides one level undo
00021 
00022   The tags include an attribute list:
00023 
00024     - linked list of attribute/value nodes
00025     - each node has 2 NULL-terminated strings.
00026     - entities are replaced in attribute values
00027 
00028   White space is compacted unless in preformatted mode:
00029   outside preformatted mode, leading white space
00030   is discarded and subsequent white space sequences
00031   are compacted to single space characters.
00032 
00033   If XmlTags is no then Tag names are folded to upper
00034   case and attribute names to lower case.
00035 
00036  Not yet done:
00037     -   Doctype subset and marked sections
00038 */
00039 
00040 #ifdef __cplusplus
00041 extern "C" {
00042 #endif
00043 
00044 #include "forward.h"
00045 
00046 /* lexer character types: each flag below is a distinct single bit.
00047    NOTE: these are plain lowercase macros, so the preprocessor replaces any identifier spelled the same way in code that includes this header. */
00048 #define digit       1u
00049 #define letter      2u
00050 #define namechar    4u
00051 #define white       8u
00052 #define newline     16u
00053 #define lowercase   32u
00054 #define uppercase   64u
00055 #define digithex    128u
00056 
00057 
00058 /* node->type is one of these values (the `type` member of struct _Node).
00059    Values are implicit: RootNode is 0 and each following name is one higher, so reordering changes them. */
00060 typedef enum
00061 {
00062   RootNode,
00063   DocTypeTag,
00064   CommentTag,
00065   ProcInsTag,
00066   TextNode,
00067   StartTag,
00068   EndTag,
00069   StartEndTag,
00070   CDATATag,
00071   SectionTag,
00072   AspTag,
00073   JsteTag,
00074   PhpTag,
00075   XmlDecl
00076 } NodeType;
00077 
00078 
00079 
00080 /* lexer GetToken states: the states of the lexer's finite state machine;
00081    struct _Lexer keeps the current one in its `state` member. */
00082 typedef enum
00083 {
00084   LEX_CONTENT,
00085   LEX_GT,
00086   LEX_ENDTAG,
00087   LEX_STARTTAG,
00088   LEX_COMMENT,
00089   LEX_DOCTYPE,
00090   LEX_PROCINSTR,
00091   LEX_ENDCOMMENT,
00092   LEX_CDATA,
00093   LEX_SECTION,
00094   LEX_ASP,
00095   LEX_JSTE,
00096   LEX_PHP,
00097   LEX_XMLDECL
00098 } LexerState;
00099 
00100 /* ParseDocTypeDecl state constants (ParseDocTypeDecl itself is not declared in this header) */
00101 typedef enum
00102 {
00103   DT_INTERMEDIATE,
00104   DT_DOCTYPENAME,
00105   DT_PUBLICSYSTEM,
00106   DT_QUOTEDSTRING,
00107   DT_INTSUBSET
00108 } ParseDocTypeDeclState;
00109 
00110 /* content model shortcut encoding
00111    CM_UNKNOWN is 0; every other CM_* value is a distinct single bit.
00112    Descriptions are tentative.
00113 */
00114 #define CM_UNKNOWN      0
00115 /* Elements with no content. Map to HTML specification. */
00116 #define CM_EMPTY        (1 << 0)
00117 /* Elements that appear outside of "BODY". */
00118 #define CM_HTML         (1 << 1)
00119 /* Elements that can appear within HEAD. */
00120 #define CM_HEAD         (1 << 2)
00121 /* HTML "block" elements. */
00122 #define CM_BLOCK        (1 << 3)
00123 /* HTML "inline" elements. */
00124 #define CM_INLINE       (1 << 4)
00125 /* Elements that mark list item ("LI"). */
00126 #define CM_LIST         (1 << 5)
00127 /* Elements that mark definition list item ("DT", "DD"). */
00128 #define CM_DEFLIST      (1 << 6)
00129 /* Elements that can appear inside TABLE. */
00130 #define CM_TABLE        (1 << 7)
00131 /* Used for "THEAD", "TFOOT" or "TBODY". */
00132 #define CM_ROWGRP       (1 << 8)
00133 /* Used for "TD", "TH" */
00134 #define CM_ROW          (1 << 9)
00135 /* Elements whose content must be protected against white space movement.
00136    Includes some elements that can be found in forms. */
00137 #define CM_FIELD        (1 << 10)
00138 /* Used to avoid propagating inline emphasis inside some elements
00139    such as OBJECT or APPLET. */
00140 #define CM_OBJECT       (1 << 11)
00141 /* Elements that allow "PARAM". */
00142 #define CM_PARAM        (1 << 12)
00143 /* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
00144 #define CM_FRAMES       (1 << 13)
00145 /* Heading elements (h1, h2, ...). */
00146 #define CM_HEADING      (1 << 14)
00147 /* Elements with an optional end tag. */
00148 #define CM_OPT          (1 << 15)
00149 /* Elements that use "align" attribute for vertical position. */
00150 #define CM_IMG          (1 << 16)
00151 /* Elements with inline and block model. Used to avoid calling InlineDup. */
00152 #define CM_MIXED        (1 << 17)
00153 /* Elements whose content needs to be indented only if containing one 
00154    CM_BLOCK element. */
00155 #define CM_NO_INDENT    (1 << 18)
00156 /* Elements that are obsolete (such as "dir", "menu"). */
00157 #define CM_OBSOLETE     (1 << 19)
00158 /* User defined elements. Used to determine how attributes without value
00159    should be printed. */
00160 #define CM_NEW          (1 << 20)
00161 /* Elements that cannot be omitted. */
00162 #define CM_OMITST       (1 << 21)
00163 
00164 /* If the document uses just HTML 2.0 tags and attributes, describe
00165 ** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
00166 ** If there are proprietary tags and attributes then describe it as
00167 ** HTML Proprietary. If it includes the xml-lang or xmlns attributes
00168 ** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
00169 ** flavors of Voyager (strict, loose or frameset).
00170 */
00171 
00172 /* unknown: no version bit set */
00173 #define xxxx                   0u
00174 
00175 /* W3C defined HTML/XHTML family document types (one distinct bit each) */
00176 #define HT20                   1u
00177 #define HT32                   2u
00178 #define H40S                   4u
00179 #define H40T                   8u
00180 #define H40F                  16u
00181 #define H41S                  32u
00182 #define H41T                  64u
00183 #define H41F                 128u
00184 #define X10S                 256u
00185 #define X10T                 512u
00186 #define X10F                1024u
00187 #define XH11                2048u
00188 #define XB10                4096u
00189 
00190 /* proprietary stuff (one distinct bit each) */
00191 #define VERS_SUN            8192u
00192 #define VERS_NETSCAPE      16384u
00193 #define VERS_MICROSOFT     32768u
00194 
00195 /* special flag (its own bit, not part of VERS_ALL or VERS_PROPRIETARY below) */
00196 #define VERS_XML           65536u
00197 
00198 /* compatibility symbols: longer names for the short codes above, some covering several codes */
00199 #define VERS_UNKNOWN       (xxxx)
00200 #define VERS_HTML20        (HT20)
00201 #define VERS_HTML32        (HT32)
00202 #define VERS_HTML40_STRICT (H40S|H41S|X10S)
00203 #define VERS_HTML40_LOOSE  (H40T|H41T|X10T)
00204 #define VERS_FRAMESET      (H40F|H41F|X10F)
00205 #define VERS_XHTML11       (XH11)
00206 #define VERS_BASIC         (XB10)
00207 
00208 /* meta symbols: unions of the symbols defined above */
00209 #define VERS_HTML40        (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
00210 #define VERS_IFRAME        (VERS_HTML40_LOOSE|VERS_FRAMESET)
00211 #define VERS_LOOSE         (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
00212 #define VERS_EVENTS        (VERS_HTML40|VERS_XHTML11)
00213 #define VERS_FROM32        (VERS_HTML32|VERS_HTML40)
00214 #define VERS_FROM40        (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
00215 #define VERS_XHTML         (X10S|X10T|X10F|XH11|XB10)
00216 
00217 /* all W3C defined document types */
00218 #define VERS_ALL           (VERS_HTML20|VERS_HTML32|VERS_FROM40)
00219 
00220 /* all proprietary types */
00221 #define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
00222 
00223 /* Linked list of class names and styles
00224    (struct _Lexer refers to such a list through its `styles` member) */
00225 struct _Style;
00226 typedef struct _Style TagStyle;
00227 
00228 struct _Style
00229 {
00230     tmbstr tag;          /* presumably the element name the style belongs to -- confirm */
00231     tmbstr tag_class;    /* presumably the class name -- confirm */
00232     tmbstr properties;   /* presumably the style properties text -- confirm */
00233     TagStyle *next;      /* next entry in the linked list */
00234 };
00235 
00236 
00237 /* Linked list of style properties: one name/value pair per entry
00238 */
00239 struct _StyleProp;
00240 typedef struct _StyleProp StyleProp;
00241 
00242 struct _StyleProp
00243 {
00244     tmbstr name;       /* property name */
00245     tmbstr value;      /* property value */
00246     StyleProp *next;   /* next entry in the linked list */
00247 };
00248 
00249 
00250 
00251 
00252 /* Attribute/Value linked list node: holds the strings of one attribute/value pair
00253 */
00254 
00255 struct _AttVal
00256 {
00257     AttVal*           next;        /* next node in the attribute list */
00258     const Attribute*  dict;        /* presumably the attribute's dictionary definition -- confirm */
00259     Node*             asp;         /* NOTE(review): purpose not shown in this header; presumably an ASP node tied to the attribute */
00260     Node*             php;         /* NOTE(review): purpose not shown in this header; presumably a PHP node tied to the attribute */
00261     int               delim;       /* presumably the delimiter (quote) character of the value -- confirm */
00262     tmbstr            attribute;   /* attribute name string */
00263     tmbstr            value;       /* attribute value string */
00264 };
00265 
00266 
00267 
00268 /*
00269   Mosaic handles inlines via a separate stack from other elements.
00270   We duplicate this to recover from inline markup errors such as:
00271 
00272      <i>italic text
00273      <p>more italic text</b> normal text
00274 
00275   which for compatibility with Mosaic is mapped to:
00276 
00277      <i>italic text</i>
00278      <p><i>more italic text</i> normal text
00279 
00280   Note that any inline end tag pops the effect of the current
00281   inline start tag, so that </b> pops <i> in the above example.
00282 */
00283 struct _IStack
00284 {
00285     IStack*     next;       /* NOTE(review): link to another stack entry; direction not shown here */
00286     const Dict* tag;        /* tag's dictionary definition */
00287     tmbstr      element;    /* name (NULL for text nodes) */
00288     AttVal*     attributes; /* attribute list kept with the stacked inline element */
00289 };
00290 
00291 
00292 /* Parse tree node: HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
00293 ** etc. etc. The `type` member says which kind a given node is.
00294 */
00295 
00296 struct _Node
00297 {
00298     Node*       parent;         /* tree structure */
00299     Node*       prev;           /* presumably the previous peer -- confirm */
00300     Node*       next;           /* next peer (FreeNode's comment: set to NULL to avoid freeing peers) */
00301     Node*       content;        /* child content; presumably the first child -- confirm */
00302     Node*       last;           /* presumably the last child -- confirm */
00303 
00304     AttVal*     attributes;     /* linked list of AttVal nodes */
00305     const Dict* was;            /* old tag when it was changed */
00306     const Dict* tag;            /* tag's dictionary definition */
00307 
00308     tmbstr      element;        /* name (NULL for text nodes) */
00309 
00310     uint        start;          /* start of span onto text array (offset into lexbuf) */
00311     uint        end;            /* end of span onto text array (offset into lexbuf) */
00312     NodeType    type;           /* TextNode, StartTag, EndTag etc. */
00313 
00314     uint        line;           /* current line of document */
00315     uint        column;         /* current column of document */
00316 
00317     Bool        closed;         /* true if closed by explicit end tag */
00318     Bool        implicit;       /* true if inferred */
00319     Bool        linebreak;      /* true if followed by a line break */
00320 
00321 #ifdef TIDY_STORE_ORIGINAL_TEXT
00322     tmbstr      otext;          /* member exists only in this build flavor; presumably the original source text -- confirm */
00323 #endif
00324 };
00325 
00326 
00327 /*
00328   The following are private to the lexer.
00329   Use NewLexer() to create a lexer, and
00330   FreeLexer() to free it.
00331 */
00332 
00333 struct _Lexer
00334 {
00335 #if 0  /* compiled out: Move to TidyDocImpl */
00336     StreamIn* in;           /* document content input */
00337     StreamOut* errout;      /* error output stream */
00338 
00339     uint badAccess;         /* for accessibility errors */
00340     uint badLayout;         /* for bad style errors */
00341     uint badChars;          /* for bad character encodings */
00342     uint badForm;           /* for mismatched/mispositioned form tags */
00343     uint warnings;          /* count of warnings in this document */
00344     uint errors;            /* count of errors */
00345 #endif
00346 
00347     uint lines;             /* lines seen */
00348     uint columns;           /* at start of current token */
00349     Bool waswhite;          /* used to collapse contiguous white space */
00350     Bool pushed;            /* true after token has been pushed back */
00351     Bool insertspace;       /* when space is moved after end tag */
00352     Bool excludeBlocks;     /* Netscape compatibility */
00353     Bool exiled;            /* true if moved out of table */
00354     Bool isvoyager;         /* true if xmlns attribute on html element */
00355     uint versions;          /* bit vector of HTML versions (presumably the VERS_* flags -- confirm) */
00356     uint doctype;           /* version as given by doctype (if any) */
00357     uint versionEmitted;    /* version of doctype emitted */
00358     Bool bad_doctype;       /* e.g. if html or PUBLIC is missing */
00359     uint txtstart;          /* start of current node */
00360     uint txtend;            /* end of current node */
00361     LexerState state;       /* state of lexer's finite state machine */
00362 
00363     Node* token;            /* last token returned by GetToken() */
00364     Node* itoken;           /* last duplicate inline returned by GetToken() */
00365     Node* root;             /* remember root node of the document */
00366     Node* parent;           /* remember parent node for CDATA elements */
00367     
00368     Bool seenEndBody;       /* true if a </body> tag has been encountered */
00369     Bool seenEndHtml;       /* true if a </html> tag has been encountered */
00370 
00371     /*
00372       Lexer character buffer
00373 
00374       Parse tree nodes span onto this buffer
00375       which contains the concatenated text
00376       contents of all of the elements.
00377 
00378       lexsize must be reset for each file.
00379     */
00380     tmbstr lexbuf;          /* MB character buffer */
00381     uint lexlength;         /* allocated */
00382     uint lexsize;           /* used */
00383 
00384     /* Inline stack for compatibility with Mosaic */
00385     Node* inode;            /* for deferring text node */
00386     IStack* insert;         /* for inferring inline tags */
00387     IStack* istack;         /* presumably the stack storage counted by istacklength/istacksize -- confirm */
00388     uint istacklength;      /* allocated */
00389     uint istacksize;        /* used */
00390     uint istackbase;        /* start of frame */
00391 
00392     TagStyle *styles;          /* used for cleaning up presentation markup */
00393 
00394     TidyAllocator* allocator; /* allocator */
00395 
00396 #if 0  /* compiled out */
00397     TidyDocImpl* doc;       /* Pointer back to doc for error reporting */
00398 #endif 
00399 };
00400 
00401 
00402 /* Lexer Functions
00403    (every name is wrapped in TY_(); that macro is not defined in this header) */
00404 
00405 /* choose what version to use for new doctype */
00406 int TY_(HTMLVersion)( TidyDocImpl* doc );  /* result: presumably a VERS_* code -- confirm */
00407 
00408 /* everything is allowed in proprietary version of HTML; */
00409 /* this is handled here rather than in the tag/attr dicts */
00410 
00411 void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );  /* vers: presumably VERS_* bits -- confirm */
00412 /* character tests and case mapping on uint character codes (presumably backed by the character type flags defined near the top of this header -- confirm) */
00413 Bool TY_(IsWhite)(uint c);
00414 Bool TY_(IsDigit)(uint c);
00415 Bool TY_(IsLetter)(uint c);
00416 Bool TY_(IsNewline)(uint c);
00417 Bool TY_(IsNamechar)(uint c);
00418 Bool TY_(IsXMLLetter)(uint c);
00419 Bool TY_(IsXMLNamechar)(uint c);
00420 
00421 /* Bool IsLower(uint c); -- commented out: no IsLower is declared by this header */
00422 Bool TY_(IsUpper)(uint c);
00423 uint TY_(ToLower)(uint c);
00424 uint TY_(ToUpper)(uint c);
00425 /* create and free a lexer; both take the TidyDocImpl rather than a Lexer pointer */
00426 Lexer* TY_(NewLexer)( TidyDocImpl* doc );
00427 void TY_(FreeLexer)( TidyDocImpl* doc );
00428 
00429 /* store character c in the lexer as a UTF-8 encoded byte stream */
00430 void TY_(AddCharToLexer)( Lexer *lexer, uint c );
00431 
00432 /*
00433   Used for elements and text nodes.
00434   The element name is NULL for text nodes.
00435   start and end are offsets into lexbuf,
00436   which contains the textual content of
00437   all elements in the parse tree.
00438 
00439   parent and content allow traversal
00440   of the parse tree in any direction.
00441   Attributes are represented as a linked
00442   list of AttVal nodes which hold the
00443   strings for attribute/value pairs.
00444 */
00445 Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );  /* takes the allocator explicitly, in addition to the lexer */
00446 
00447 
00448 /* used to clone heading nodes when split by an <HR> */
00449 Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
00450 
00451 /* free node's attributes */
00452 void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
00453 
00454 /* frees av; doesn't repair attribute list linkage */
00455 void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
00456 
00457 /* detach attribute from node (RemoveAttribute below is the variant that also frees it) */
00458 void TY_(DetachAttribute)( Node *node, AttVal *attr );
00459 
00460 /* detach attribute from node then free it
00461 */
00462 void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
00463 
00464 /*
00465   Free document nodes by iterating through peers and recursing
00466   through children. Set next to NULL before calling FreeNode()
00467   to avoid freeing peer nodes. Doesn't patch up prev/next links.
00468  */
00469 void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
00470 
00471 Node* TY_(TextToken)( Lexer *lexer );  /* NOTE(review): undocumented here; presumably returns a text node for the lexer's current text span */
00472 
00473 /* used for creating preformatted text from Word2000 */
00474 Node* TY_(NewLineNode)( Lexer *lexer );
00475 
00476 /* used for adding a &nbsp; for Word2000 */
00477 Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
00478 
00479 void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );  /* presumably adds str to the lexer's character buffer -- confirm */
00480 /* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); -- commented out, not declared */
00481 
00482 /* find element; the result when no such node exists is not stated here (presumably NULL -- confirm) */
00483 Node* TY_(FindDocType)( TidyDocImpl* doc );
00484 Node* TY_(FindHTML)( TidyDocImpl* doc );
00485 Node* TY_(FindHEAD)( TidyDocImpl* doc );
00486 Node* TY_(FindTITLE)(TidyDocImpl* doc);
00487 Node* TY_(FindBody)( TidyDocImpl* doc );
00488 Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
00489 
00490 /* Returns containing block element, if any */
00491 Node* TY_(FindContainer)( Node* node );
00492 
00493 /* add meta element for Tidy */
00494 Bool TY_(AddGenerator)( TidyDocImpl* doc );
00495 
00496 uint TY_(ApparentVersion)( TidyDocImpl* doc );  /* result: presumably a VERS_* code -- confirm */
00497 
00498 ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml );  /* vers: presumably a VERS_* code -- confirm */
00499 
00500 Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );  /* NOTE(review): undocumented here; see lexer.c */
00501 
00502 Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );  /* NOTE(review): meaning of the Bool result is not stated here */
00503 
00504 
00505 /* fixup doctype if missing */
00506 Bool TY_(FixDocType)( TidyDocImpl* doc );
00507 
00508 /* ensure XML document starts with <?xml version="1.0"?>; */
00509 /* add encoding attribute if not using ASCII or UTF-8 output */
00510 Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
00511 
00512 Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);  /* presumably creates a node for tag id marked as inferred (cf. Node.implicit) -- confirm */
00513 
00514 void TY_(UngetToken)( TidyDocImpl* doc );  /* provides one level of undo for GetToken (see file header) */
00515 
00516 
00517 /*
00518   modes for GetToken()
00519 
00520   MixedContent   -- for elements which don't accept PCDATA
00521   Preformatted   -- white space preserved as is
00522   IgnoreMarkup   -- for CDATA elements such as script, style
00523   (IgnoreWhitespace and CdataContent are not described in this header) */
00524 typedef enum
00525 {
00526   IgnoreWhitespace,
00527   MixedContent,
00528   Preformatted,
00529   IgnoreMarkup,
00530   CdataContent
00531 } GetTokenMode;
00532 
00533 Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );  /* gets the next token (see file header) */
00534 
00535 void TY_(InitMap)(void);  /* NOTE(review): undocumented here; presumably sets up the character classification map */
00536 
00537 
00538 /* create a new attribute */
00539 AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
00540 
00541 /* create a new attribute with given name and value (delim presumably fills AttVal.delim -- confirm) */
00542 AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
00543                              int delim );
00544 
00545 /* insert attribute at the end of attribute list of a node */
00546 void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
00547 
00548 /* insert attribute at the start of attribute list of a node */
00549 void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
00550 
00551 /*************************************
00552   In-line Stack functions
00553 *************************************/
00554 
00555 
00556 /* duplicate attributes (takes and returns an AttVal list pointer) */
00557 AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
00558 
00559 /*
00560   Push a copy of an inline node onto the stack,
00561   but don't push if implicit or OBJECT or APPLET
00562   (implicit tags are ones generated from the istack).
00563 
00564   One issue arises with pushing inlines when
00565   the tag is already pushed. For instance:
00566 
00567       <p><em>text
00568       <p><em>more text
00569 
00570   shouldn't be mapped to
00571 
00572       <p><em>text</em></p>
00573       <p><em><em>more text</em></em>
00574 */
00575 void TY_(PushInline)( TidyDocImpl* doc, Node* node );
00576 
00577 /* pop inline stack */
00578 void TY_(PopInline)( TidyDocImpl* doc, Node* node );
00579 
00580 Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );  /* presumably true if node's tag is already on the inline stack -- confirm */
00581 Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );  /* NOTE(review): undocumented here; see lexer.c */
00582 
00583 /*
00584   This has the effect of inserting "missing" inline
00585   elements around the contents of blocklevel elements
00586   such as P, TD, TH, DIV, PRE etc. This procedure is
00587   called at the start of ParseBlock when the inline
00588   stack is not empty, as will be the case in:
00589 
00590     <i><h1>italic heading</h1></i>
00591 
00592   which is then treated as equivalent to
00593 
00594     <h1><i>italic heading</i></h1>
00595 
00596   This is implemented by setting the lexer into a mode
00597   where it gets tokens from the inline stack rather than
00598   from the input stream.
00599 */
00600 int TY_(InlineDup)( TidyDocImpl* doc, Node *node );  /* NOTE(review): meaning of the int result is not stated here */
00601 
00602 /*
00603  Defer duplicates when entering a table or other
00604  element where the inlines shouldn't be duplicated.
00605 */
00606 void TY_(DeferDup)( TidyDocImpl* doc );
00607 Node* TY_(InsertedToken)( TidyDocImpl* doc );  /* presumably the next token taken from the inline stack (see InlineDup above) -- confirm */
00608 
00609 /* stack manipulation for inline elements */
00610 Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
00611 Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );  /* note: (node, element) order is the reverse of SwitchInline's */
00612 
00613 #ifdef __cplusplus
00614 }
00615 #endif
00616 
00617 
00618 #endif /* __LEXER_H__ */