HTML Tidy
0.1
|
00001 #ifndef __LEXER_H__ 00002 #define __LEXER_H__ 00003 00004 /* lexer.h -- Lexer for html parser 00005 00006 (c) 1998-2007 (W3C) MIT, ERCIM, Keio University 00007 See tidy.h for the copyright notice. 00008 00009 CVS Info: 00010 $Author: arnaud02 $ 00011 $Date: 2007/01/17 10:51:18 $ 00012 $Revision: 1.40 $ 00013 00014 */ 00015 00016 /* 00017 Given an input source, it returns a sequence of tokens. 00018 00019 GetToken(source) gets the next token 00020 UngetToken(source) provides one level undo 00021 00022 The tags include an attribute list: 00023 00024 - linked list of attribute/value nodes 00025 - each node has 2 NULL-terminated strings. 00026 - entities are replaced in attribute values 00027 00028 white space is compacted if not in preformatted mode 00029 If not in preformatted mode then leading white space 00030 is discarded and subsequent white space sequences 00031 compacted to single space characters. 00032 00033 If XmlTags is no then Tag names are folded to upper 00034 case and attribute names to lower case. 00035 00036 Not yet done: 00037 - Doctype subset and marked sections 00038 */ 00039 00040 #ifdef __cplusplus 00041 extern "C" { 00042 #endif 00043 00044 #include "forward.h" 00045 00046 /* lexer character types 00047 */ 00048 #define digit 1u 00049 #define letter 2u 00050 #define namechar 4u 00051 #define white 8u 00052 #define newline 16u 00053 #define lowercase 32u 00054 #define uppercase 64u 00055 #define digithex 128u 00056 00057 00058 /* node->type is one of these values 00059 */ 00060 typedef enum 00061 { 00062 RootNode, 00063 DocTypeTag, 00064 CommentTag, 00065 ProcInsTag, 00066 TextNode, 00067 StartTag, 00068 EndTag, 00069 StartEndTag, 00070 CDATATag, 00071 SectionTag, 00072 AspTag, 00073 JsteTag, 00074 PhpTag, 00075 XmlDecl 00076 } NodeType; 00077 00078 00079 00080 /* lexer GetToken states 00081 */ 00082 typedef enum 00083 { 00084 LEX_CONTENT, 00085 LEX_GT, 00086 LEX_ENDTAG, 00087 LEX_STARTTAG, 00088 LEX_COMMENT, 00089 LEX_DOCTYPE, 00090 LEX_PROCINSTR, 00091 LEX_ENDCOMMENT, 00092 LEX_CDATA, 00093 LEX_SECTION, 00094 LEX_ASP, 00095 LEX_JSTE, 00096 LEX_PHP, 00097 LEX_XMLDECL 00098 } LexerState; 00099 00100 /* ParseDocTypeDecl state constants */ 00101 typedef enum 00102 { 00103 DT_INTERMEDIATE, 00104 DT_DOCTYPENAME, 00105 DT_PUBLICSYSTEM, 00106 DT_QUOTEDSTRING, 00107 DT_INTSUBSET 00108 } ParseDocTypeDeclState; 00109 00110 /* content model shortcut encoding 00111 00112 Descriptions are tentative. 00113 */ 00114 #define CM_UNKNOWN 0 00115 /* Elements with no content. Map to HTML specification. */ 00116 #define CM_EMPTY (1 << 0) 00117 /* Elements that appear outside of "BODY". */ 00118 #define CM_HTML (1 << 1) 00119 /* Elements that can appear within HEAD. */ 00120 #define CM_HEAD (1 << 2) 00121 /* HTML "block" elements. */ 00122 #define CM_BLOCK (1 << 3) 00123 /* HTML "inline" elements. */ 00124 #define CM_INLINE (1 << 4) 00125 /* Elements that mark list item ("LI"). */ 00126 #define CM_LIST (1 << 5) 00127 /* Elements that mark definition list item ("DL", "DT"). */ 00128 #define CM_DEFLIST (1 << 6) 00129 /* Elements that can appear inside TABLE. */ 00130 #define CM_TABLE (1 << 7) 00131 /* Used for "THEAD", "TFOOT" or "TBODY". */ 00132 #define CM_ROWGRP (1 << 8) 00133 /* Used for "TD", "TH" */ 00134 #define CM_ROW (1 << 9) 00135 /* Elements whose content must be protected against white space movement. 00136 Includes some elements that can found in forms. */ 00137 #define CM_FIELD (1 << 10) 00138 /* Used to avoid propagating inline emphasis inside some elements 00139 such as OBJECT or APPLET. */ 00140 #define CM_OBJECT (1 << 11) 00141 /* Elements that allows "PARAM". */ 00142 #define CM_PARAM (1 << 12) 00143 /* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ 00144 #define CM_FRAMES (1 << 13) 00145 /* Heading elements (h1, h2, ...). */ 00146 #define CM_HEADING (1 << 14) 00147 /* Elements with an optional end tag. */ 00148 #define CM_OPT (1 << 15) 00149 /* Elements that use "align" attribute for vertical position. */ 00150 #define CM_IMG (1 << 16) 00151 /* Elements with inline and block model. Used to avoid calling InlineDup. */ 00152 #define CM_MIXED (1 << 17) 00153 /* Elements whose content needs to be indented only if containing one 00154 CM_BLOCK element. */ 00155 #define CM_NO_INDENT (1 << 18) 00156 /* Elements that are obsolete (such as "dir", "menu"). */ 00157 #define CM_OBSOLETE (1 << 19) 00158 /* User defined elements. Used to determine how attributes wihout value 00159 should be printed. */ 00160 #define CM_NEW (1 << 20) 00161 /* Elements that cannot be omitted. */ 00162 #define CM_OMITST (1 << 21) 00163 00164 /* If the document uses just HTML 2.0 tags and attributes described 00165 ** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. 00166 ** If there are proprietary tags and attributes then describe it as 00167 ** HTML Proprietary. If it includes the xml-lang or xmlns attributes 00168 ** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the 00169 ** flavors of Voyager (strict, loose or frameset). 00170 */ 00171 00172 /* unknown */ 00173 #define xxxx 0u 00174 00175 /* W3C defined HTML/XHTML family document types */ 00176 #define HT20 1u 00177 #define HT32 2u 00178 #define H40S 4u 00179 #define H40T 8u 00180 #define H40F 16u 00181 #define H41S 32u 00182 #define H41T 64u 00183 #define H41F 128u 00184 #define X10S 256u 00185 #define X10T 512u 00186 #define X10F 1024u 00187 #define XH11 2048u 00188 #define XB10 4096u 00189 00190 /* proprietary stuff */ 00191 #define VERS_SUN 8192u 00192 #define VERS_NETSCAPE 16384u 00193 #define VERS_MICROSOFT 32768u 00194 00195 /* special flag */ 00196 #define VERS_XML 65536u 00197 00198 /* compatibility symbols */ 00199 #define VERS_UNKNOWN (xxxx) 00200 #define VERS_HTML20 (HT20) 00201 #define VERS_HTML32 (HT32) 00202 #define VERS_HTML40_STRICT (H40S|H41S|X10S) 00203 #define VERS_HTML40_LOOSE (H40T|H41T|X10T) 00204 #define VERS_FRAMESET (H40F|H41F|X10F) 00205 #define VERS_XHTML11 (XH11) 00206 #define VERS_BASIC (XB10) 00207 00208 /* meta symbols */ 00209 #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET) 00210 #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET) 00211 #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME) 00212 #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11) 00213 #define VERS_FROM32 (VERS_HTML32|VERS_HTML40) 00214 #define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC) 00215 #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10) 00216 00217 /* all W3C defined document types */ 00218 #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40) 00219 00220 /* all proprietary types */ 00221 #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) 00222 00223 /* Linked list of class names and styles 00224 */ 00225 struct _Style; 00226 typedef struct _Style TagStyle; 00227 00228 struct _Style 00229 { 00230 tmbstr tag; 00231 tmbstr tag_class; 00232 tmbstr properties; 00233 TagStyle *next; 00234 }; 00235 00236 00237 /* Linked list of style properties 00238 */ 00239 struct _StyleProp; 00240 typedef struct _StyleProp StyleProp; 00241 00242 struct _StyleProp 00243 { 00244 tmbstr name; 00245 tmbstr value; 00246 StyleProp *next; 00247 }; 00248 00249 00250 00251 00252 /* Attribute/Value linked list node 00253 */ 00254 00255 struct _AttVal 00256 { 00257 AttVal* next; 00258 const Attribute* dict; 00259 Node* asp; 00260 Node* php; 00261 int delim; 00262 tmbstr attribute; 00263 tmbstr value; 00264 }; 00265 00266 00267 00268 /* 00269 Mosaic handles inlines via a separate stack from other elements 00270 We duplicate this to recover from inline markup errors such as: 00271 00272 <i>italic text 00273 <p>more italic text</b> normal text 00274 00275 which for compatibility with Mosaic is mapped to: 00276 00277 <i>italic text</i> 00278 <p><i>more italic text</i> normal text 00279 00280 Note that any inline end tag pop's the effect of the current 00281 inline start tag, so that </b> pop's <i> in the above example. 00282 */ 00283 struct _IStack 00284 { 00285 IStack* next; 00286 const Dict* tag; /* tag's dictionary definition */ 00287 tmbstr element; /* name (NULL for text nodes) */ 00288 AttVal* attributes; 00289 }; 00290 00291 00292 /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, 00293 ** etc. etc. 00294 */ 00295 00296 struct _Node 00297 { 00298 Node* parent; /* tree structure */ 00299 Node* prev; 00300 Node* next; 00301 Node* content; 00302 Node* last; 00303 00304 AttVal* attributes; 00305 const Dict* was; /* old tag when it was changed */ 00306 const Dict* tag; /* tag's dictionary definition */ 00307 00308 tmbstr element; /* name (NULL for text nodes) */ 00309 00310 uint start; /* start of span onto text array */ 00311 uint end; /* end of span onto text array */ 00312 NodeType type; /* TextNode, StartTag, EndTag etc. */ 00313 00314 uint line; /* current line of document */ 00315 uint column; /* current column of document */ 00316 00317 Bool closed; /* true if closed by explicit end tag */ 00318 Bool implicit; /* true if inferred */ 00319 Bool linebreak; /* true if followed by a line break */ 00320 00321 #ifdef TIDY_STORE_ORIGINAL_TEXT 00322 tmbstr otext; 00323 #endif 00324 }; 00325 00326 00327 /* 00328 The following are private to the lexer 00329 Use NewLexer() to create a lexer, and 00330 FreeLexer() to free it. 00331 */ 00332 00333 struct _Lexer 00334 { 00335 #if 0 /* Move to TidyDocImpl */ 00336 StreamIn* in; /* document content input */ 00337 StreamOut* errout; /* error output stream */ 00338 00339 uint badAccess; /* for accessibility errors */ 00340 uint badLayout; /* for bad style errors */ 00341 uint badChars; /* for bad character encodings */ 00342 uint badForm; /* for mismatched/mispositioned form tags */ 00343 uint warnings; /* count of warnings in this document */ 00344 uint errors; /* count of errors */ 00345 #endif 00346 00347 uint lines; /* lines seen */ 00348 uint columns; /* at start of current token */ 00349 Bool waswhite; /* used to collapse contiguous white space */ 00350 Bool pushed; /* true after token has been pushed back */ 00351 Bool insertspace; /* when space is moved after end tag */ 00352 Bool excludeBlocks; /* Netscape compatibility */ 00353 Bool exiled; /* true if moved out of table */ 00354 Bool isvoyager; /* true if xmlns attribute on html element */ 00355 uint versions; /* bit vector of HTML versions */ 00356 uint doctype; /* version as given by doctype (if any) */ 00357 uint versionEmitted; /* version of doctype emitted */ 00358 Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ 00359 uint txtstart; /* start of current node */ 00360 uint txtend; /* end of current node */ 00361 LexerState state; /* state of lexer's finite state machine */ 00362 00363 Node* token; /* last token returned by GetToken() */ 00364 Node* itoken; /* last duplicate inline returned by GetToken() */ 00365 Node* root; /* remember root node of the document */ 00366 Node* parent; /* remember parent node for CDATA elements */ 00367 00368 Bool seenEndBody; /* true if a </body> tag has been encountered */ 00369 Bool seenEndHtml; /* true if a </html> tag has been encountered */ 00370 00371 /* 00372 Lexer character buffer 00373 00374 Parse tree nodes span onto this buffer 00375 which contains the concatenated text 00376 contents of all of the elements. 00377 00378 lexsize must be reset for each file. 00379 */ 00380 tmbstr lexbuf; /* MB character buffer */ 00381 uint lexlength; /* allocated */ 00382 uint lexsize; /* used */ 00383 00384 /* Inline stack for compatibility with Mosaic */ 00385 Node* inode; /* for deferring text node */ 00386 IStack* insert; /* for inferring inline tags */ 00387 IStack* istack; 00388 uint istacklength; /* allocated */ 00389 uint istacksize; /* used */ 00390 uint istackbase; /* start of frame */ 00391 00392 TagStyle *styles; /* used for cleaning up presentation markup */ 00393 00394 TidyAllocator* allocator; /* allocator */ 00395 00396 #if 0 00397 TidyDocImpl* doc; /* Pointer back to doc for error reporting */ 00398 #endif 00399 }; 00400 00401 00402 /* Lexer Functions 00403 */ 00404 00405 /* choose what version to use for new doctype */ 00406 int TY_(HTMLVersion)( TidyDocImpl* doc ); 00407 00408 /* everything is allowed in proprietary version of HTML */ 00409 /* this is handled here rather than in the tag/attr dicts */ 00410 00411 void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers ); 00412 00413 Bool TY_(IsWhite)(uint c); 00414 Bool TY_(IsDigit)(uint c); 00415 Bool TY_(IsLetter)(uint c); 00416 Bool TY_(IsNewline)(uint c); 00417 Bool TY_(IsNamechar)(uint c); 00418 Bool TY_(IsXMLLetter)(uint c); 00419 Bool TY_(IsXMLNamechar)(uint c); 00420 00421 /* Bool IsLower(uint c); */ 00422 Bool TY_(IsUpper)(uint c); 00423 uint TY_(ToLower)(uint c); 00424 uint TY_(ToUpper)(uint c); 00425 00426 Lexer* TY_(NewLexer)( TidyDocImpl* doc ); 00427 void TY_(FreeLexer)( TidyDocImpl* doc ); 00428 00429 /* store character c as UTF-8 encoded byte stream */ 00430 void TY_(AddCharToLexer)( Lexer *lexer, uint c ); 00431 00432 /* 00433 Used for elements and text nodes 00434 element name is NULL for text nodes 00435 start and end are offsets into lexbuf 00436 which contains the textual content of 00437 all elements in the parse tree. 00438 00439 parent and content allow traversal 00440 of the parse tree in any direction. 00441 attributes are represented as a linked 00442 list of AttVal nodes which hold the 00443 strings for attribute/value pairs. 00444 */ 00445 Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer ); 00446 00447 00448 /* used to clone heading nodes when split by an <HR> */ 00449 Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element ); 00450 00451 /* free node's attributes */ 00452 void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ); 00453 00454 /* doesn't repair attribute list linkage */ 00455 void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ); 00456 00457 /* detach attribute from node */ 00458 void TY_(DetachAttribute)( Node *node, AttVal *attr ); 00459 00460 /* detach attribute from node then free it 00461 */ 00462 void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ); 00463 00464 /* 00465 Free document nodes by iterating through peers and recursing 00466 through children. Set next to NULL before calling FreeNode() 00467 to avoid freeing peer nodes. Doesn't patch up prev/next links. 00468 */ 00469 void TY_(FreeNode)( TidyDocImpl* doc, Node *node ); 00470 00471 Node* TY_(TextToken)( Lexer *lexer ); 00472 00473 /* used for creating preformatted text from Word2000 */ 00474 Node* TY_(NewLineNode)( Lexer *lexer ); 00475 00476 /* used for adding a for Word2000 */ 00477 Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt ); 00478 00479 void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ); 00480 /* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */ 00481 00482 /* find element */ 00483 Node* TY_(FindDocType)( TidyDocImpl* doc ); 00484 Node* TY_(FindHTML)( TidyDocImpl* doc ); 00485 Node* TY_(FindHEAD)( TidyDocImpl* doc ); 00486 Node* TY_(FindTITLE)(TidyDocImpl* doc); 00487 Node* TY_(FindBody)( TidyDocImpl* doc ); 00488 Node* TY_(FindXmlDecl)(TidyDocImpl* doc); 00489 00490 /* Returns containing block element, if any */ 00491 Node* TY_(FindContainer)( Node* node ); 00492 00493 /* add meta element for Tidy */ 00494 Bool TY_(AddGenerator)( TidyDocImpl* doc ); 00495 00496 uint TY_(ApparentVersion)( TidyDocImpl* doc ); 00497 00498 ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml ); 00499 00500 Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc ); 00501 00502 Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ); 00503 00504 00505 /* fixup doctype if missing */ 00506 Bool TY_(FixDocType)( TidyDocImpl* doc ); 00507 00508 /* ensure XML document starts with <?xml version="1.0"?> */ 00509 /* add encoding attribute if not using ASCII or UTF-8 output */ 00510 Bool TY_(FixXmlDecl)( TidyDocImpl* doc ); 00511 00512 Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id); 00513 00514 void TY_(UngetToken)( TidyDocImpl* doc ); 00515 00516 00517 /* 00518 modes for GetToken() 00519 00520 MixedContent -- for elements which don't accept PCDATA 00521 Preformatted -- white space preserved as is 00522 IgnoreMarkup -- for CDATA elements such as script, style 00523 */ 00524 typedef enum 00525 { 00526 IgnoreWhitespace, 00527 MixedContent, 00528 Preformatted, 00529 IgnoreMarkup, 00530 CdataContent 00531 } GetTokenMode; 00532 00533 Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode ); 00534 00535 void TY_(InitMap)(void); 00536 00537 00538 /* create a new attribute */ 00539 AttVal* TY_(NewAttribute)( TidyDocImpl* doc ); 00540 00541 /* create a new attribute with given name and value */ 00542 AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value, 00543 int delim ); 00544 00545 /* insert attribute at the end of attribute list of a node */ 00546 void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av ); 00547 00548 /* insert attribute at the start of attribute list of a node */ 00549 void TY_(InsertAttributeAtStart)( Node *node, AttVal *av ); 00550 00551 /************************************* 00552 In-line Stack functions 00553 *************************************/ 00554 00555 00556 /* duplicate attributes */ 00557 AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs ); 00558 00559 /* 00560 push a copy of an inline node onto stack 00561 but don't push if implicit or OBJECT or APPLET 00562 (implicit tags are ones generated from the istack) 00563 00564 One issue arises with pushing inlines when 00565 the tag is already pushed. For instance: 00566 00567 <p><em>text 00568 <p><em>more text 00569 00570 Shouldn't be mapped to 00571 00572 <p><em>text</em></p> 00573 <p><em><em>more text</em></em> 00574 */ 00575 void TY_(PushInline)( TidyDocImpl* doc, Node* node ); 00576 00577 /* pop inline stack */ 00578 void TY_(PopInline)( TidyDocImpl* doc, Node* node ); 00579 00580 Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node ); 00581 Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node ); 00582 00583 /* 00584 This has the effect of inserting "missing" inline 00585 elements around the contents of blocklevel elements 00586 such as P, TD, TH, DIV, PRE etc. This procedure is 00587 called at the start of ParseBlock. when the inline 00588 stack is not empty, as will be the case in: 00589 00590 <i><h1>italic heading</h1></i> 00591 00592 which is then treated as equivalent to 00593 00594 <h1><i>italic heading</i></h1> 00595 00596 This is implemented by setting the lexer into a mode 00597 where it gets tokens from the inline stack rather than 00598 from the input stream. 00599 */ 00600 int TY_(InlineDup)( TidyDocImpl* doc, Node *node ); 00601 00602 /* 00603 defer duplicates when entering a table or other 00604 element where the inlines shouldn't be duplicated 00605 */ 00606 void TY_(DeferDup)( TidyDocImpl* doc ); 00607 Node* TY_(InsertedToken)( TidyDocImpl* doc ); 00608 00609 /* stack manipulation for inline elements */ 00610 Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node ); 00611 Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element ); 00612 00613 #ifdef __cplusplus 00614 } 00615 #endif 00616 00617 00618 #endif /* __LEXER_H__ */