HTML Tidy  0.1
utf8.h
00001 #ifndef __UTF8_H__
00002 #define __UTF8_H__
00003 
00004 /* utf8.h -- convert characters to/from UTF-8
00005 
00006   (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
00007   See tidy.h for the copyright notice.
00008 
00009   CVS Info :
00010 
00011     $Author: arnaud02 $ 
00012     $Date: 2006/09/12 15:14:44 $ 
00013     $Revision: 1.5 $ 
00014 
00015 */
00016 
00017 #include "platform.h"
00018 #include "buffio.h"
00019 
00020 /* UTF-8 encoding/decoding support
00021 ** Handles only the UTF-8 byte encoding; does not convert character "codepoints", i.e. to/from ISO/IEC 10646.
00022 */
00023 
/* Prototype only -- the definition is not in this header.
** Judging by the name and signature, this decodes a UTF-8 byte sequence that
** starts with firstByte and stores the resulting character through c.
** NOTE(review): presumably the remaining bytes come from successorBytes or
** are read from inp, and count reports the number of bytes involved; the
** meaning of the int result is not visible here either -- confirm all of
** this against the implementation.
*/
00024 int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
00025                                 TidyInputSource* inp, int* count );
00026 
/* Prototype only -- the definition is not in this header.
** Judging by the name and signature, this encodes the character c as UTF-8
** bytes.
** NOTE(review): presumably the bytes go to encodebuf or to the sink outp,
** and count reports how many bytes were produced; the meaning of the int
** result is not visible here -- confirm against the implementation.
*/
00027 int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
00028                                 TidyOutputSink* outp, int* count );
00029 
00030 
/* String-oriented helpers; prototypes only, definitions are not in this header.
** NOTE(review): by name, GetUTF8 reads one UTF-8 encoded character from str
** into *ch, and PutUTF8 writes c as UTF-8 into buf. What the uint and tmbstr
** results represent is not shown here -- confirm against the implementation.
*/
00031 uint  TY_(GetUTF8)( ctmbstr str, uint *ch );
00032 tmbstr TY_(PutUTF8)( tmbstr buf, uint c );
00033 
/* Byte order mark (BOM) constants.
** UNICODE_BOM is an alias for the big-endian value. UNICODE_BOM_LE is the
** same 16-bit value with its two bytes swapped. UNICODE_BOM_UTF8 holds the
** three bytes EF BB BF as a single integer.
*/
00034 #define UNICODE_BOM_BE   0xFEFF   /* big-endian (default) UNICODE BOM */
00035 #define UNICODE_BOM      UNICODE_BOM_BE
00036 #define UNICODE_BOM_LE   0xFFFE   /* little-endian UNICODE BOM */
00037 #define UNICODE_BOM_UTF8 0xEFBBBF /* UTF-8 UNICODE BOM */
00038 
00039 
/* UTF-16 related predicates on a single tchar; prototypes only.
** NOTE(review): by name, IsValidUTF16FromUCS4 checks whether a UCS-4 value
** can be represented in UTF-16, and IsHighSurrogate / IsLowSurrogate test
** for the two halves of a surrogate pair. The exact ranges accepted are
** defined in the implementation, not here -- confirm there.
*/
00040 Bool    TY_(IsValidUTF16FromUCS4)( tchar ucs4 );
00041 Bool    TY_(IsHighSurrogate)( tchar ch );
00042 Bool    TY_(IsLowSurrogate)( tchar ch );
00043 
/* Predicates on a "combined" character; prototypes only.
** NOTE(review): this header does not define what "combined" means.
** Presumably it refers to a value obtained by combining a surrogate pair
** (see CombineSurrogatePair in this header) -- confirm against the
** implementation, along with what makes such a value "valid".
*/
00044 Bool    TY_(IsCombinedChar)( tchar ch );
00045 Bool    TY_(IsValidCombinedChar)( tchar ch );
00046 
/* Surrogate pair conversion; prototypes only.
** NOTE(review): by name, CombineSurrogatePair merges a high and a low
** surrogate into one tchar, and SplitSurrogatePair does the reverse,
** writing the halves through high and low. What the Bool result of
** SplitSurrogatePair signals is not visible here -- confirm against the
** implementation.
*/
00047 tchar   TY_(CombineSurrogatePair)( tchar high, tchar low );
00048 Bool    TY_(SplitSurrogatePair)( tchar utf16, tchar* high, tchar* low );
00049 
00050 
00051 
00052 #endif /* __UTF8_H__ */