HTML Tidy
0.1
|
00001 #ifndef __STREAMIO_H__ 00002 #define __STREAMIO_H__ 00003 00004 /* streamio.h -- handles character stream I/O 00005 00006 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 00007 See tidy.h for the copyright notice. 00008 00009 CVS Info : 00010 00011 $Author: arnaud02 $ 00012 $Date: 2006/12/29 16:31:08 $ 00013 $Revision: 1.18 $ 00014 00015 Wrapper around Tidy input source and output sink 00016 that calls appropriate interfaces, and applies 00017 necessary char encoding transformations: to/from 00018 ISO-10646 and/or UTF-8. 00019 00020 */ 00021 00022 #include "forward.h" 00023 #include "buffio.h" 00024 #include "fileio.h" 00025 00026 #ifdef __cplusplus 00027 extern "C" 00028 { 00029 #endif 00030 typedef enum 00031 { 00032 FileIO, 00033 BufferIO, 00034 UserIO 00035 } IOType; 00036 00037 /* states for ISO 2022 00038 00039 A document in ISO-2022 based encoding uses some ESC sequences called 00040 "designator" to switch character sets. The designators defined and 00041 used in ISO-2022-JP are: 00042 00043 "ESC" + "(" + ? for ISO646 variants 00044 00045 "ESC" + "$" + ? and 00046 "ESC" + "$" + "(" + ? for multibyte character sets 00047 */ 00048 typedef enum 00049 { 00050 FSM_ASCII, 00051 FSM_ESC, 00052 FSM_ESCD, 00053 FSM_ESCDP, 00054 FSM_ESCP, 00055 FSM_NONASCII 00056 } ISO2022State; 00057 00058 /************************ 00059 ** Source 00060 ************************/ 00061 00062 #define CHARBUF_SIZE 5 00063 00064 /* non-raw input is cleaned up*/ 00065 struct _StreamIn 00066 { 00067 ISO2022State state; /* FSM for ISO2022 */ 00068 Bool pushed; 00069 TidyAllocator *allocator; 00070 tchar* charbuf; 00071 uint bufpos; 00072 uint bufsize; 00073 int tabs; 00074 int lastcol; 00075 int curcol; 00076 int curline; 00077 int encoding; 00078 IOType iotype; 00079 00080 TidyInputSource source; 00081 00082 #ifdef TIDY_WIN32_MLANG_SUPPORT 00083 ulong mlang; 00084 #endif 00085 00086 #ifdef TIDY_STORE_ORIGINAL_TEXT 00087 tmbstr otextbuf; 00088 size_t otextsize; 00089 uint otextlen; 00090 #endif 00091 00092 /* Pointer back to document for error reporting */ 00093 TidyDocImpl* doc; 00094 }; 00095 00096 StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding ); 00097 void TY_(freeStreamIn)(StreamIn* in); 00098 00099 StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE* fp, int encoding ); 00100 StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* content, int encoding ); 00101 StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding ); 00102 00103 int TY_(ReadBOMEncoding)(StreamIn *in); 00104 uint TY_(ReadChar)( StreamIn* in ); 00105 void TY_(UngetChar)( uint c, StreamIn* in ); 00106 Bool TY_(IsEOF)( StreamIn* in ); 00107 00108 00109 /************************ 00110 ** Sink 00111 ************************/ 00112 00113 struct _StreamOut 00114 { 00115 int encoding; 00116 ISO2022State state; /* for ISO 2022 */ 00117 uint nl; 00118 00119 #ifdef TIDY_WIN32_MLANG_SUPPORT 00120 ulong mlang; 00121 #endif 00122 00123 IOType iotype; 00124 TidyOutputSink sink; 00125 }; 00126 00127 StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint newln ); 00128 StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint newln ); 00129 StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint newln ); 00130 00131 StreamOut* TY_(StdErrOutput)(void); 00132 /* StreamOut* StdOutOutput(void); */ 00133 void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out ); 00134 00135 void TY_(WriteChar)( uint c, StreamOut* out ); 00136 void TY_(outBOM)( StreamOut *out ); 00137 00138 ctmbstr TY_(GetEncodingNameFromTidyId)(uint id); 00139 ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id); 00140 int TY_(GetCharEncodingFromOptName)(ctmbstr charenc); 00141 00142 /************************ 00143 ** Misc 00144 ************************/ 00145 00146 /* character encodings 00147 */ 00148 #define RAW 0 00149 #define ASCII 1 00150 #define LATIN0 2 00151 #define LATIN1 3 00152 #define UTF8 4 00153 #define ISO2022 5 00154 #define MACROMAN 6 00155 #define WIN1252 7 00156 #define IBM858 8 00157 00158 #if SUPPORT_UTF16_ENCODINGS 00159 #define UTF16LE 9 00160 #define UTF16BE 10 00161 #define UTF16 11 00162 #endif 00163 00164 /* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints 00165 ** (i.e., to Unicode) before being recoded into UTF-8. This may be 00166 ** confusing: usually UTF-8 implies ISO10646 codepoints. 00167 */ 00168 #if SUPPORT_ASIAN_ENCODINGS 00169 #if SUPPORT_UTF16_ENCODINGS 00170 #define BIG5 12 00171 #define SHIFTJIS 13 00172 #else 00173 #define BIG5 9 00174 #define SHIFTJIS 10 00175 #endif 00176 #endif 00177 00178 #ifdef TIDY_WIN32_MLANG_SUPPORT 00179 /* hack: windows code page numbers start at 37 */ 00180 #define WIN32MLANG 36 00181 #endif 00182 00183 00184 /* char encoding used when replacing illegal SGML chars, 00185 ** regardless of specified encoding. Set at compile time 00186 ** to either Windows or Mac. 00187 */ 00188 extern const int TY_(ReplacementCharEncoding); 00189 00190 /* Function for conversion from Windows-1252 to Unicode */ 00191 uint TY_(DecodeWin1252)(uint c); 00192 00193 /* Function to convert from MacRoman to Unicode */ 00194 uint TY_(DecodeMacRoman)(uint c); 00195 00196 #ifdef __cplusplus 00197 } 00198 #endif 00199 00200 00201 /* Use numeric constants as opposed to escape chars (\r, \n) 00202 ** to avoid conflict Mac compilers that may re-define these. 00203 */ 00204 #define CR 0xD 00205 #define LF 0xA 00206 00207 #if defined(MAC_OS_CLASSIC) 00208 #define DEFAULT_NL_CONFIG TidyCR 00209 #elif defined(_WIN32) || defined(OS2_OS) 00210 #define DEFAULT_NL_CONFIG TidyCRLF 00211 #else 00212 #define DEFAULT_NL_CONFIG TidyLF 00213 #endif 00214 00215 00216 #endif /* __STREAMIO_H__ */