HTML Tidy  0.1
streamio.h
00001 #ifndef __STREAMIO_H__
00002 #define __STREAMIO_H__
00003 
00004 /* streamio.h -- handles character stream I/O
00005 
00006   (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
00007   See tidy.h for the copyright notice.
00008 
00009   CVS Info :
00010 
00011     $Author: arnaud02 $ 
00012     $Date: 2006/12/29 16:31:08 $ 
00013     $Revision: 1.18 $ 
00014 
00015   Wrapper around Tidy input source and output sink
00016   that calls appropriate interfaces, and applies 
00017   necessary char encoding transformations: to/from
00018   ISO-10646 and/or UTF-8.
00019 
00020 */
00021 
00022 #include "forward.h"
00023 #include "buffio.h"
00024 #include "fileio.h"
00025 
00026 #ifdef __cplusplus
00027 extern "C"
00028 {
00029 #endif
00030 typedef enum        /* Kind of endpoint a stream is bound to; stored in the iotype field of _StreamIn and _StreamOut. */
00031 {
00032   FileIO,           /* NOTE(review): presumably used by TY_(FileInput) / TY_(FileOutput), which take a FILE* -- confirm in streamio.c */
00033   BufferIO,         /* NOTE(review): presumably used by TY_(BufferInput) / TY_(BufferOutput), which take a TidyBuffer* -- confirm */
00034   UserIO            /* NOTE(review): presumably used by TY_(UserInput) / TY_(UserOutput), which take a caller-supplied source/sink -- confirm */
00035 } IOType;
00036 
00037 /* states for ISO 2022
00038 
00039  A document in an ISO-2022 based encoding uses ESC sequences called
00040  "designators" to switch character sets. The designators defined and
00041  used in ISO-2022-JP are:
00042 
00043     "ESC" + "(" + ?     for ISO646 variants
00044 
00045     "ESC" + "$" + ?     and
00046     "ESC" + "$" + "(" + ?   for multibyte character sets
00047 */
00048 typedef enum        /* States of the ISO 2022 state machine; kept in the state field of _StreamIn and _StreamOut. */
00049 {
00050   FSM_ASCII,        /* NOTE(review): presumably the default state, outside any escape sequence -- confirm in streamio.c */
00051   FSM_ESC,          /* presumably: "ESC" has been seen -- confirm */
00052   FSM_ESCD,         /* presumably: "ESC" + "$" has been seen -- confirm */
00053   FSM_ESCDP,        /* presumably: "ESC" + "$" + "(" has been seen -- confirm */
00054   FSM_ESCP,         /* presumably: "ESC" + "(" has been seen -- confirm */
00055   FSM_NONASCII      /* presumably: a multibyte character set is currently designated -- confirm */
00056 } ISO2022State;
00057 
00058 /************************
00059 ** Source
00060 ************************/
00061 
00062 #define CHARBUF_SIZE 5    /* NOTE(review): presumably the initial number of tchar slots in _StreamIn.charbuf -- confirm in streamio.c */
00063 
00064 /* non-raw input is cleaned up*/
00065 struct _StreamIn    /* Input side of the stream wrapper: a TidyInputSource plus decoding and position bookkeeping. */
00066 {
00067     ISO2022State    state;     /* FSM for ISO2022 */
00068     Bool   pushed;             /* NOTE(review): presumably set while charbuf holds characters returned via TY_(UngetChar) -- confirm */
00069     TidyAllocator *allocator;  /* NOTE(review): presumably the allocator used for this stream's buffers -- confirm */
00070     tchar* charbuf;            /* character buffer; see bufpos / bufsize (presumably the push-back buffer -- confirm) */
00071     uint   bufpos;             /* presumably the current position within charbuf -- confirm */
00072     uint   bufsize;            /* presumably the allocated capacity of charbuf, in tchar units -- confirm */
00073     int    tabs;               /* NOTE(review): looks like tab-expansion state -- confirm in streamio.c */
00074     int    lastcol;            /* presumably the column before the most recent character was read -- confirm */
00075     int    curcol;             /* presumably the current column in the input -- confirm */
00076     int    curline;            /* presumably the current line in the input -- confirm */
00077     int    encoding;           /* character encoding id; presumably one of the ids #defined under "character encodings" in this header */
00078     IOType iotype;             /* kind of endpoint behind source (FileIO, BufferIO or UserIO) */
00079 
00080     TidyInputSource source;    /* the Tidy input source wrapped by this stream */
00081 
00082 #ifdef TIDY_WIN32_MLANG_SUPPORT
00083     ulong  mlang;              /* only present in TIDY_WIN32_MLANG_SUPPORT builds; NOTE(review): presumably an MLang handle/code page -- confirm */
00084 #endif
00085 
00086 #ifdef TIDY_STORE_ORIGINAL_TEXT
00087     tmbstr otextbuf;           /* only present in TIDY_STORE_ORIGINAL_TEXT builds; presumably the stored original text -- confirm */
00088     size_t otextsize;          /* presumably the allocated size of otextbuf -- confirm */
00089     uint   otextlen;           /* presumably the number of bytes currently used in otextbuf -- confirm */
00090 #endif
00091 
00092     /* Pointer back to document for error reporting */
00093     TidyDocImpl* doc;
00094 };
00095 
00096 StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding );    /* presumably creates a StreamIn for doc using the given encoding id -- confirm */
00097 void TY_(freeStreamIn)(StreamIn* in);                             /* presumably releases a StreamIn and the buffers it owns -- confirm */
00098 
00099 StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE* fp, int encoding );                  /* input stream over a FILE*; NOTE(review): ownership of fp not visible here */
00100 StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* content, int encoding );     /* input stream over a TidyBuffer */
00101 StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding );   /* input stream over a caller-supplied TidyInputSource */
00102 
00103 int       TY_(ReadBOMEncoding)(StreamIn *in);        /* NOTE(review): presumably inspects a byte order mark and returns an encoding id; failure value not visible here */
00104 uint      TY_(ReadChar)( StreamIn* in );             /* presumably returns the next decoded character from in -- confirm */
00105 void      TY_(UngetChar)( uint c, StreamIn* in );    /* presumably pushes c back so that it is read again -- confirm */
00106 Bool      TY_(IsEOF)( StreamIn* in );                /* presumably reports whether in is exhausted -- confirm */
00107 
00108 
00109 /************************
00110 ** Sink
00111 ************************/
00112 
00113 struct _StreamOut   /* Output side of the stream wrapper: a TidyOutputSink plus encoding state. */
00114 {
00115     int   encoding;            /* character encoding id; presumably one of the ids #defined under "character encodings" in this header */
00116     ISO2022State   state;     /* for ISO 2022 */
00117     uint  nl;                  /* NOTE(review): presumably the newline style to emit (cf. the newln constructor argument and DEFAULT_NL_CONFIG) -- confirm */
00118 
00119 #ifdef TIDY_WIN32_MLANG_SUPPORT
00120     ulong mlang;               /* only present in TIDY_WIN32_MLANG_SUPPORT builds; NOTE(review): presumably an MLang handle/code page -- confirm */
00121 #endif
00122 
00123     IOType iotype;             /* kind of endpoint behind sink (FileIO, BufferIO or UserIO) */
00124     TidyOutputSink sink;       /* the Tidy output sink wrapped by this stream */
00125 };
00126 
00127 StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint newln );              /* output stream over a FILE*; newln is presumably the newline style -- confirm */
00128 StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint newln );     /* output stream over a TidyBuffer */
00129 StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint newln );  /* output stream over a caller-supplied TidyOutputSink */
00130 
00131 StreamOut* TY_(StdErrOutput)(void);    /* NOTE(review): presumably returns a stream that writes to stderr; ownership/lifetime not visible here */
00132 /* StreamOut* StdOutOutput(void); */
00133 void       TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out );    /* presumably releases a StreamOut obtained from the functions above -- confirm */
00134 
00135 void TY_(WriteChar)( uint c, StreamOut* out );    /* presumably encodes c according to out->encoding and writes it to out->sink -- confirm */
00136 void TY_(outBOM)( StreamOut *out );               /* presumably writes a byte order mark appropriate to out->encoding -- confirm */
00137 
00138 ctmbstr TY_(GetEncodingNameFromTidyId)(uint id);       /* maps an encoding id to a name; NOTE(review): result for unknown ids not visible here */
00139 ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id);    /* maps an encoding id to its option name; NOTE(review): result for unknown ids not visible here */
00140 int TY_(GetCharEncodingFromOptName)(ctmbstr charenc);  /* maps an option name to an encoding id; NOTE(review): failure value not visible here */
00141 
00142 /************************
00143 ** Misc
00144 ************************/
00145 
00146 /* character encodings
00147 */
00148 #define RAW         0     /* ids 0..8 are defined in every build configuration */
00149 #define ASCII       1
00150 #define LATIN0      2
00151 #define LATIN1      3
00152 #define UTF8        4
00153 #define ISO2022     5
00154 #define MACROMAN    6
00155 #define WIN1252     7
00156 #define IBM858      8
00157 
00158 #if SUPPORT_UTF16_ENCODINGS   /* tested with #if, not #ifdef: must expand to a nonzero value; an undefined macro counts as 0 */
00159 #define UTF16LE     9
00160 #define UTF16BE     10
00161 #define UTF16       11
00162 #endif
00163 
00164 /* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints
00165 ** (i.e., to Unicode) before being recoded into UTF-8. This may be
00166 ** confusing: usually UTF-8 implies ISO10646 codepoints.
00167 */
00168 #if SUPPORT_ASIAN_ENCODINGS   /* BIG5 / SHIFTJIS take the next free ids, so their values depend on SUPPORT_UTF16_ENCODINGS */
00169 #if SUPPORT_UTF16_ENCODINGS
00170 #define BIG5        12        /* ids 9..11 are taken by the UTF-16 encodings */
00171 #define SHIFTJIS    13
00172 #else
00173 #define BIG5        9         /* no UTF-16 ids defined: continue directly after IBM858 (8) */
00174 #define SHIFTJIS    10
00175 #endif
00176 #endif
00177 
00178 #ifdef TIDY_WIN32_MLANG_SUPPORT
00179 /* hack: windows code page numbers start at 37 */
00180 #define WIN32MLANG  36        /* NOTE(review): presumably ids above this value are treated as Windows code page numbers -- confirm in streamio.c */
00181 #endif
00182 
00183 
00184 /* char encoding used when replacing illegal SGML chars,
00185 ** regardless of specified encoding.  Set at compile time
00186 ** to either Windows or Mac.
00187 */
00188 extern const int TY_(ReplacementCharEncoding);    /* declaration only; the definition lives in a source file not shown here */
00189 
00190 /* Function for conversion from Windows-1252 to Unicode */
00191 uint TY_(DecodeWin1252)(uint c);                  /* NOTE(review): accepted range of c and result for unmapped values not visible here */
00192 
00193 /* Function to convert from MacRoman to Unicode */
00194 uint TY_(DecodeMacRoman)(uint c);                 /* NOTE(review): accepted range of c and result for unmapped values not visible here */
00195 
00196 #ifdef __cplusplus
00197 }
00198 #endif
00199 
00200 
00201 /* Use numeric constants as opposed to escape chars (\r, \n)
00202 ** to avoid conflicts with Mac compilers that may re-define these.
00203 */
00204 #define CR    0xD     /* carriage return (ASCII 13) */
00205 #define LF    0xA     /* line feed (ASCII 10) */
00206 
00207 #if   defined(MAC_OS_CLASSIC)                  /* platform default newline setting, chosen at compile time */
00208 #define DEFAULT_NL_CONFIG TidyCR
00209 #elif defined(_WIN32) || defined(OS2_OS)
00210 #define DEFAULT_NL_CONFIG TidyCRLF
00211 #else                                          /* every other platform */
00212 #define DEFAULT_NL_CONFIG TidyLF
00213 #endif
00214 
00215 
00216 #endif /* __STREAMIO_H__ */