Ruby 1.9.3p448 (2013-06-27 revision 41675)
string.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   string.c -
00004 
00005   $Author: usa $
00006   created at: Mon Aug  9 17:12:58 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
00010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
00011 
00012 **********************************************************************/
00013 
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "internal.h"
00018 #include <assert.h>
00019 
00020 #define BEG(no) (regs->beg[(no)])
00021 #define END(no) (regs->end[(no)])
00022 
00023 #include <math.h>
00024 #include <ctype.h>
00025 
00026 #ifdef HAVE_UNISTD_H
00027 #include <unistd.h>
00028 #endif
00029 
00030 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00031 
00032 #undef rb_str_new_cstr
00033 #undef rb_tainted_str_new_cstr
00034 #undef rb_usascii_str_new_cstr
00035 #undef rb_external_str_new_cstr
00036 #undef rb_locale_str_new_cstr
00037 #undef rb_str_new2
00038 #undef rb_str_new3
00039 #undef rb_str_new4
00040 #undef rb_str_new5
00041 #undef rb_tainted_str_new2
00042 #undef rb_usascii_str_new2
00043 #undef rb_str_dup_frozen
00044 #undef rb_str_buf_new_cstr
00045 #undef rb_str_buf_new2
00046 #undef rb_str_buf_cat2
00047 #undef rb_str_cat2
00048 
00049 static VALUE rb_str_clear(VALUE str);
00050 
00051 VALUE rb_cString;
00052 VALUE rb_cSymbol;
00053 
/* String-specific constants and the user flag bits this file assigns to them. */
#define RUBY_MAX_CHAR_LEN 16
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3

/* Both STR_NOEMBED and ELTS_SHARED are set. */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* Both STR_NOEMBED and STR_ASSOC are set. */
#define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)

/* Flag combination under which heap.aux is not used as a capacity. */
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
/* STR_NOEMBED is set together with at least one of ELTS_SHARED / STR_ASSOC. */
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* For a non-embedded string, drop the ELTS_SHARED and STR_ASSOC flags. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
} while (0)
00066 
00067 
/* Store n in the embedded-length bit field of the object's flags word. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)

/* Set STR_NOEMBED and reset the embedded-length bits to zero. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)

/* Clear STR_NOEMBED. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
/* True when STR_NOEMBED is not set. */
#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
00079 
/*
 * Set the length of str to n: in the flag bits for an embedded string,
 * in as.heap.len otherwise.
 */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrease the length of str by one, wherever that length is stored. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
00099 
/*
 * Resize the buffer of str so it can hold `capacity` bytes plus one.
 *
 * Embedded string: nothing happens unless `capacity` exceeds
 * RSTRING_EMBED_LEN_MAX, in which case a heap buffer is allocated, the
 * current bytes are copied over and the string is switched to heap storage.
 * Heap string: the buffer is reallocated; aux.capa is updated only when
 * STR_NOCAPA_P is false.
 *
 * Note: `str` and `capacity` are evaluated more than once.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00117 
/* Coderange predicates; both may trigger a scan through rb_enc_str_coderange(). */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* Encoding object for the encoding index stored on str. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00122 
00123 static inline int
00124 single_byte_optimizable(VALUE str)
00125 {
00126     rb_encoding *enc;
00127 
00128     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
00129     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00130         return 1;
00131 
00132     enc = STR_ENC_GET(str);
00133     if (rb_enc_mbmaxlen(enc) == 1)
00134         return 1;
00135 
00136     /* Conservative.  Possibly single byte.
00137      * "\xa1" in Shift_JIS for example. */
00138     return 0;
00139 }
00140 
00141 VALUE rb_fs;
00142 
/*
 * Return a pointer to the first non-ASCII byte in [p, e), or NULL when
 * every byte is ASCII.
 *
 * When VALUE is 4 or 8 bytes wide and the range is longer than two words,
 * the aligned middle part is tested one word at a time against
 * NONASCII_MASK; the unaligned head and the remainder are tested byte by
 * byte.  NONASCII_MASK stays defined after this function.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE *)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE *)(~align_mask & (VALUE)e);

        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK)
                break;
        }
        /* continue bytewise from the offending word, or from the tail */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00179 
00180 static int
00181 coderange_scan(const char *p, long len, rb_encoding *enc)
00182 {
00183     const char *e = p + len;
00184 
00185     if (rb_enc_to_index(enc) == 0) {
00186         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00187         p = search_nonascii(p, e);
00188         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00189     }
00190 
00191     if (rb_enc_asciicompat(enc)) {
00192         p = search_nonascii(p, e);
00193         if (!p) {
00194             return ENC_CODERANGE_7BIT;
00195         }
00196         while (p < e) {
00197             int ret = rb_enc_precise_mbclen(p, e, enc);
00198             if (!MBCLEN_CHARFOUND_P(ret)) {
00199                 return ENC_CODERANGE_BROKEN;
00200             }
00201             p += MBCLEN_CHARFOUND_LEN(ret);
00202             if (p < e) {
00203                 p = search_nonascii(p, e);
00204                 if (!p) {
00205                     return ENC_CODERANGE_VALID;
00206                 }
00207             }
00208         }
00209         if (e < p) {
00210             return ENC_CODERANGE_BROKEN;
00211         }
00212         return ENC_CODERANGE_VALID;
00213     }
00214 
00215     while (p < e) {
00216         int ret = rb_enc_precise_mbclen(p, e, enc);
00217 
00218         if (!MBCLEN_CHARFOUND_P(ret)) {
00219             return ENC_CODERANGE_BROKEN;
00220         }
00221         p += MBCLEN_CHARFOUND_LEN(ret);
00222     }
00223     if (e < p) {
00224         return ENC_CODERANGE_BROKEN;
00225     }
00226     return ENC_CODERANGE_VALID;
00227 }
00228 
00229 long
00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00231 {
00232     const char *p = s;
00233 
00234     if (*cr == ENC_CODERANGE_BROKEN)
00235         return e - s;
00236 
00237     if (rb_enc_to_index(enc) == 0) {
00238         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00239         p = search_nonascii(p, e);
00240         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00241         return e - s;
00242     }
00243     else if (rb_enc_asciicompat(enc)) {
00244         p = search_nonascii(p, e);
00245         if (!p) {
00246             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00247             return e - s;
00248         }
00249         while (p < e) {
00250             int ret = rb_enc_precise_mbclen(p, e, enc);
00251             if (!MBCLEN_CHARFOUND_P(ret)) {
00252                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00253                 return p - s;
00254             }
00255             p += MBCLEN_CHARFOUND_LEN(ret);
00256             if (p < e) {
00257                 p = search_nonascii(p, e);
00258                 if (!p) {
00259                     *cr = ENC_CODERANGE_VALID;
00260                     return e - s;
00261                 }
00262             }
00263         }
00264         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00265         return p - s;
00266     }
00267     else {
00268         while (p < e) {
00269             int ret = rb_enc_precise_mbclen(p, e, enc);
00270             if (!MBCLEN_CHARFOUND_P(ret)) {
00271                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00272                 return p - s;
00273             }
00274             p += MBCLEN_CHARFOUND_LEN(ret);
00275         }
00276         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00277         return p - s;
00278     }
00279 }
00280 
00281 static inline void
00282 str_enc_copy(VALUE str1, VALUE str2)
00283 {
00284     rb_enc_set_index(str1, ENCODING_GET(str2));
00285 }
00286 
00287 static void
00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00289 {
00290     /* this function is designed for copying encoding and coderange
00291      * from src to new string "dest" which is made from the part of src.
00292      */
00293     str_enc_copy(dest, src);
00294     switch (ENC_CODERANGE(src)) {
00295       case ENC_CODERANGE_7BIT:
00296         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00297         break;
00298       case ENC_CODERANGE_VALID:
00299         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00300             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00301             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00302         else
00303             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00304         break;
00305       default:
00306         if (RSTRING_LEN(dest) == 0) {
00307             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00308                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00309             else
00310                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00311         }
00312         break;
00313     }
00314 }
00315 
00316 static void
00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00318 {
00319     str_enc_copy(dest, src);
00320     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00321 }
00322 
00323 int
00324 rb_enc_str_coderange(VALUE str)
00325 {
00326     int cr = ENC_CODERANGE(str);
00327 
00328     if (cr == ENC_CODERANGE_UNKNOWN) {
00329         rb_encoding *enc = STR_ENC_GET(str);
00330         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00331         ENC_CODERANGE_SET(str, cr);
00332     }
00333     return cr;
00334 }
00335 
00336 int
00337 rb_enc_str_asciionly_p(VALUE str)
00338 {
00339     rb_encoding *enc = STR_ENC_GET(str);
00340 
00341     if (!rb_enc_asciicompat(enc))
00342         return FALSE;
00343     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00344         return TRUE;
00345     return FALSE;
00346 }
00347 
00348 static inline void
00349 str_mod_check(VALUE s, const char *p, long len)
00350 {
00351     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00352         rb_raise(rb_eRuntimeError, "string modified");
00353     }
00354 }
00355 
00356 size_t
00357 rb_str_capacity(VALUE str)
00358 {
00359     if (STR_EMBED_P(str)) {
00360         return RSTRING_EMBED_LEN_MAX;
00361     }
00362     else if (STR_NOCAPA_P(str)) {
00363         return RSTRING(str)->as.heap.len;
00364     }
00365     else {
00366         return RSTRING(str)->as.heap.aux.capa;
00367     }
00368 }
00369 
00370 static inline VALUE
00371 str_alloc(VALUE klass)
00372 {
00373     NEWOBJ(str, struct RString);
00374     OBJSETUP(str, klass, T_STRING);
00375 
00376     str->as.heap.ptr = 0;
00377     str->as.heap.len = 0;
00378     str->as.heap.aux.capa = 0;
00379 
00380     return (VALUE)str;
00381 }
00382 
00383 static VALUE
00384 str_new(VALUE klass, const char *ptr, long len)
00385 {
00386     VALUE str;
00387 
00388     if (len < 0) {
00389         rb_raise(rb_eArgError, "negative string size (or size too big)");
00390     }
00391 
00392     str = str_alloc(klass);
00393     if (len > RSTRING_EMBED_LEN_MAX) {
00394         RSTRING(str)->as.heap.aux.capa = len;
00395         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00396         STR_SET_NOEMBED(str);
00397     }
00398     else if (len == 0) {
00399         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00400     }
00401     if (ptr) {
00402         memcpy(RSTRING_PTR(str), ptr, len);
00403     }
00404     STR_SET_LEN(str, len);
00405     RSTRING_PTR(str)[len] = '\0';
00406     return str;
00407 }
00408 
00409 VALUE
00410 rb_str_new(const char *ptr, long len)
00411 {
00412     return str_new(rb_cString, ptr, len);
00413 }
00414 
00415 VALUE
00416 rb_usascii_str_new(const char *ptr, long len)
00417 {
00418     VALUE str = rb_str_new(ptr, len);
00419     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00420     return str;
00421 }
00422 
00423 VALUE
00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00425 {
00426     VALUE str = rb_str_new(ptr, len);
00427     rb_enc_associate(str, enc);
00428     return str;
00429 }
00430 
00431 VALUE
00432 rb_str_new_cstr(const char *ptr)
00433 {
00434     if (!ptr) {
00435         rb_raise(rb_eArgError, "NULL pointer given");
00436     }
00437     return rb_str_new(ptr, strlen(ptr));
00438 }
00439 
00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00441 #define rb_str_new2 rb_str_new_cstr
00442 
00443 VALUE
00444 rb_usascii_str_new_cstr(const char *ptr)
00445 {
00446     VALUE str = rb_str_new2(ptr);
00447     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00448     return str;
00449 }
00450 
00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00453 
00454 VALUE
00455 rb_tainted_str_new(const char *ptr, long len)
00456 {
00457     VALUE str = rb_str_new(ptr, len);
00458 
00459     OBJ_TAINT(str);
00460     return str;
00461 }
00462 
00463 VALUE
00464 rb_tainted_str_new_cstr(const char *ptr)
00465 {
00466     VALUE str = rb_str_new2(ptr);
00467 
00468     OBJ_TAINT(str);
00469     return str;
00470 }
00471 
00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00474 
00475 VALUE
00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00477 {
00478     rb_econv_t *ec;
00479     rb_econv_result_t ret;
00480     long len;
00481     VALUE newstr;
00482     const unsigned char *sp;
00483     unsigned char *dp;
00484 
00485     if (!to) return str;
00486     if (from == to) return str;
00487     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00488         to == rb_ascii8bit_encoding()) {
00489         if (STR_ENC_GET(str) != to) {
00490             str = rb_str_dup(str);
00491             rb_enc_associate(str, to);
00492         }
00493         return str;
00494     }
00495 
00496     len = RSTRING_LEN(str);
00497     newstr = rb_str_new(0, len);
00498 
00499   retry:
00500     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00501     if (!ec) return str;
00502 
00503     sp = (unsigned char*)RSTRING_PTR(str);
00504     dp = (unsigned char*)RSTRING_PTR(newstr);
00505     ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00506                            &dp, (unsigned char*)RSTRING_END(newstr), 0);
00507     rb_econv_close(ec);
00508     switch (ret) {
00509       case econv_destination_buffer_full:
00510         /* destination buffer short */
00511         len = len < 2 ? 2 : len * 2;
00512         rb_str_resize(newstr, len);
00513         goto retry;
00514 
00515       case econv_finished:
00516         len = dp - (unsigned char*)RSTRING_PTR(newstr);
00517         rb_str_set_len(newstr, len);
00518         rb_enc_associate(newstr, to);
00519         return newstr;
00520 
00521       default:
00522         /* some error, return original */
00523         return str;
00524     }
00525 }
00526 
00527 VALUE
00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00529 {
00530     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00531 }
00532 
00533 VALUE
00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00535 {
00536     VALUE str;
00537 
00538     str = rb_tainted_str_new(ptr, len);
00539     if (eenc == rb_usascii_encoding() &&
00540         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00541         rb_enc_associate(str, rb_ascii8bit_encoding());
00542         return str;
00543     }
00544     rb_enc_associate(str, eenc);
00545     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00546 }
00547 
00548 VALUE
00549 rb_external_str_new(const char *ptr, long len)
00550 {
00551     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00552 }
00553 
00554 VALUE
00555 rb_external_str_new_cstr(const char *ptr)
00556 {
00557     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00558 }
00559 
00560 VALUE
00561 rb_locale_str_new(const char *ptr, long len)
00562 {
00563     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00564 }
00565 
00566 VALUE
00567 rb_locale_str_new_cstr(const char *ptr)
00568 {
00569     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00570 }
00571 
00572 VALUE
00573 rb_filesystem_str_new(const char *ptr, long len)
00574 {
00575     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00576 }
00577 
00578 VALUE
00579 rb_filesystem_str_new_cstr(const char *ptr)
00580 {
00581     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00582 }
00583 
00584 VALUE
00585 rb_str_export(VALUE str)
00586 {
00587     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00588 }
00589 
00590 VALUE
00591 rb_str_export_locale(VALUE str)
00592 {
00593     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00594 }
00595 
00596 VALUE
00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00598 {
00599     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00600 }
00601 
00602 static VALUE
00603 str_replace_shared(VALUE str2, VALUE str)
00604 {
00605     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00606         STR_SET_EMBED(str2);
00607         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00608         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00609     }
00610     else {
00611         str = rb_str_new_frozen(str);
00612         FL_SET(str2, STR_NOEMBED);
00613         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00614         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00615         RSTRING(str2)->as.heap.aux.shared = str;
00616         FL_SET(str2, ELTS_SHARED);
00617     }
00618     rb_enc_cr_str_exact_copy(str2, str);
00619 
00620     return str2;
00621 }
00622 
00623 static VALUE
00624 str_new_shared(VALUE klass, VALUE str)
00625 {
00626     return str_replace_shared(str_alloc(klass), str);
00627 }
00628 
00629 static VALUE
00630 str_new3(VALUE klass, VALUE str)
00631 {
00632     return str_new_shared(klass, str);
00633 }
00634 
00635 VALUE
00636 rb_str_new_shared(VALUE str)
00637 {
00638     VALUE str2 = str_new3(rb_obj_class(str), str);
00639 
00640     OBJ_INFECT(str2, str);
00641     return str2;
00642 }
00643 
00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00645 #define rb_str_new3 rb_str_new_shared
00646 
00647 static VALUE
00648 str_new4(VALUE klass, VALUE str)
00649 {
00650     VALUE str2;
00651 
00652     str2 = str_alloc(klass);
00653     STR_SET_NOEMBED(str2);
00654     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00655     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00656     if (STR_SHARED_P(str)) {
00657         VALUE shared = RSTRING(str)->as.heap.aux.shared;
00658         assert(OBJ_FROZEN(shared));
00659         FL_SET(str2, ELTS_SHARED);
00660         RSTRING(str2)->as.heap.aux.shared = shared;
00661     }
00662     else {
00663         FL_SET(str, ELTS_SHARED);
00664         RSTRING(str)->as.heap.aux.shared = str2;
00665     }
00666     rb_enc_cr_str_exact_copy(str2, str);
00667     OBJ_INFECT(str2, str);
00668     return str2;
00669 }
00670 
00671 VALUE
00672 rb_str_new_frozen(VALUE orig)
00673 {
00674     VALUE klass, str;
00675 
00676     if (OBJ_FROZEN(orig)) return orig;
00677     klass = rb_obj_class(orig);
00678     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00679         long ofs;
00680         assert(OBJ_FROZEN(str));
00681         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00682         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00683             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00684             ENCODING_GET(str) != ENCODING_GET(orig)) {
00685             str = str_new3(klass, str);
00686             RSTRING(str)->as.heap.ptr += ofs;
00687             RSTRING(str)->as.heap.len -= ofs;
00688             rb_enc_cr_str_exact_copy(str, orig);
00689             OBJ_INFECT(str, orig);
00690         }
00691     }
00692     else if (STR_EMBED_P(orig)) {
00693         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00694         rb_enc_cr_str_exact_copy(str, orig);
00695         OBJ_INFECT(str, orig);
00696     }
00697     else if (STR_ASSOC_P(orig)) {
00698         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00699         FL_UNSET(orig, STR_ASSOC);
00700         str = str_new4(klass, orig);
00701         FL_SET(str, STR_ASSOC);
00702         RSTRING(str)->as.heap.aux.shared = assoc;
00703     }
00704     else {
00705         str = str_new4(klass, orig);
00706     }
00707     OBJ_FREEZE(str);
00708     return str;
00709 }
00710 
00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00712 #define rb_str_new4 rb_str_new_frozen
00713 
00714 VALUE
00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00716 {
00717     return str_new(rb_obj_class(obj), ptr, len);
00718 }
00719 
00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00721            rb_str_new_with_class, (obj, ptr, len))
00722 #define rb_str_new5 rb_str_new_with_class
00723 
00724 static VALUE
00725 str_new_empty(VALUE str)
00726 {
00727     VALUE v = rb_str_new5(str, 0, 0);
00728     rb_enc_copy(v, str);
00729     OBJ_INFECT(v, str);
00730     return v;
00731 }
00732 
00733 #define STR_BUF_MIN_SIZE 128
00734 
00735 VALUE
00736 rb_str_buf_new(long capa)
00737 {
00738     VALUE str = str_alloc(rb_cString);
00739 
00740     if (capa < STR_BUF_MIN_SIZE) {
00741         capa = STR_BUF_MIN_SIZE;
00742     }
00743     FL_SET(str, STR_NOEMBED);
00744     RSTRING(str)->as.heap.aux.capa = capa;
00745     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00746     RSTRING(str)->as.heap.ptr[0] = '\0';
00747 
00748     return str;
00749 }
00750 
00751 VALUE
00752 rb_str_buf_new_cstr(const char *ptr)
00753 {
00754     VALUE str;
00755     long len = strlen(ptr);
00756 
00757     str = rb_str_buf_new(len);
00758     rb_str_buf_cat(str, ptr, len);
00759 
00760     return str;
00761 }
00762 
00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00764 #define rb_str_buf_new2 rb_str_buf_new_cstr
00765 
00766 VALUE
00767 rb_str_tmp_new(long len)
00768 {
00769     return str_new(0, 0, len);
00770 }
00771 
00772 void *
00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00774 {
00775     VALUE s = rb_str_tmp_new(len);
00776     *store = s;
00777     return RSTRING_PTR(s);
00778 }
00779 
00780 void
00781 rb_free_tmp_buffer(volatile VALUE *store)
00782 {
00783     VALUE s = *store;
00784     *store = 0;
00785     if (s) rb_str_clear(s);
00786 }
00787 
00788 void
00789 rb_str_free(VALUE str)
00790 {
00791     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00792         xfree(RSTRING(str)->as.heap.ptr);
00793     }
00794 }
00795 
00796 RUBY_FUNC_EXPORTED size_t
00797 rb_str_memsize(VALUE str)
00798 {
00799     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00800         return RSTRING(str)->as.heap.aux.capa;
00801     }
00802     else {
00803         return 0;
00804     }
00805 }
00806 
00807 VALUE
00808 rb_str_to_str(VALUE str)
00809 {
00810     return rb_convert_type(str, T_STRING, "String", "to_str");
00811 }
00812 
00813 static inline void str_discard(VALUE str);
00814 
00815 void
00816 rb_str_shared_replace(VALUE str, VALUE str2)
00817 {
00818     rb_encoding *enc;
00819     int cr;
00820     if (str == str2) return;
00821     enc = STR_ENC_GET(str2);
00822     cr = ENC_CODERANGE(str2);
00823     str_discard(str);
00824     OBJ_INFECT(str, str2);
00825     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00826         STR_SET_EMBED(str);
00827         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00828         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00829         rb_enc_associate(str, enc);
00830         ENC_CODERANGE_SET(str, cr);
00831         return;
00832     }
00833     STR_SET_NOEMBED(str);
00834     STR_UNSET_NOCAPA(str);
00835     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00836     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00837     if (STR_NOCAPA_P(str2)) {
00838         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00839         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00840     }
00841     else {
00842         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00843     }
00844     STR_SET_EMBED(str2);        /* abandon str2 */
00845     RSTRING_PTR(str2)[0] = 0;
00846     STR_SET_EMBED_LEN(str2, 0);
00847     rb_enc_associate(str, enc);
00848     ENC_CODERANGE_SET(str, cr);
00849 }
00850 
00851 static ID id_to_s;
00852 
00853 VALUE
00854 rb_obj_as_string(VALUE obj)
00855 {
00856     VALUE str;
00857 
00858     if (TYPE(obj) == T_STRING) {
00859         return obj;
00860     }
00861     str = rb_funcall(obj, id_to_s, 0);
00862     if (TYPE(str) != T_STRING)
00863         return rb_any_to_s(obj);
00864     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00865     return str;
00866 }
00867 
00868 static VALUE
00869 str_replace(VALUE str, VALUE str2)
00870 {
00871     long len;
00872 
00873     len = RSTRING_LEN(str2);
00874     if (STR_ASSOC_P(str2)) {
00875         str2 = rb_str_new4(str2);
00876     }
00877     if (STR_SHARED_P(str2)) {
00878         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00879         assert(OBJ_FROZEN(shared));
00880         STR_SET_NOEMBED(str);
00881         RSTRING(str)->as.heap.len = len;
00882         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00883         FL_SET(str, ELTS_SHARED);
00884         FL_UNSET(str, STR_ASSOC);
00885         RSTRING(str)->as.heap.aux.shared = shared;
00886     }
00887     else {
00888         str_replace_shared(str, str2);
00889     }
00890 
00891     OBJ_INFECT(str, str2);
00892     rb_enc_cr_str_exact_copy(str, str2);
00893     return str;
00894 }
00895 
00896 static VALUE
00897 str_duplicate(VALUE klass, VALUE str)
00898 {
00899     VALUE dup = str_alloc(klass);
00900     str_replace(dup, str);
00901     return dup;
00902 }
00903 
00904 VALUE
00905 rb_str_dup(VALUE str)
00906 {
00907     return str_duplicate(rb_obj_class(str), str);
00908 }
00909 
00910 VALUE
00911 rb_str_resurrect(VALUE str)
00912 {
00913     return str_replace(str_alloc(rb_cString), str);
00914 }
00915 
00916 /*
00917  *  call-seq:
00918  *     String.new(str="")   -> new_str
00919  *
00920  *  Returns a new string object containing a copy of <i>str</i>.
00921  */
00922 
00923 static VALUE
00924 rb_str_init(int argc, VALUE *argv, VALUE str)
00925 {
00926     VALUE orig;
00927 
00928     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00929         rb_str_replace(str, orig);
00930     return str;
00931 }
00932 
00933 static inline long
00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00935 {
00936     long c;
00937     const char *q;
00938 
00939     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00940         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00941     }
00942     else if (rb_enc_asciicompat(enc)) {
00943         c = 0;
00944         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00945             while (p < e) {
00946                 if (ISASCII(*p)) {
00947                     q = search_nonascii(p, e);
00948                     if (!q)
00949                         return c + (e - p);
00950                     c += q - p;
00951                     p = q;
00952                 }
00953                 p += rb_enc_fast_mbclen(p, e, enc);
00954                 c++;
00955             }
00956         }
00957         else {
00958             while (p < e) {
00959                 if (ISASCII(*p)) {
00960                     q = search_nonascii(p, e);
00961                     if (!q)
00962                         return c + (e - p);
00963                     c += q - p;
00964                     p = q;
00965                 }
00966                 p += rb_enc_mbclen(p, e, enc);
00967                 c++;
00968             }
00969         }
00970         return c;
00971     }
00972 
00973     for (c=0; p<e; c++) {
00974         p += rb_enc_mbclen(p, e, enc);
00975     }
00976     return c;
00977 }
00978 
00979 long
00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00981 {
00982     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00983 }
00984 
00985 long
00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00987 {
00988     long c;
00989     const char *q;
00990     int ret;
00991 
00992     *cr = 0;
00993     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00994         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00995     }
00996     else if (rb_enc_asciicompat(enc)) {
00997         c = 0;
00998         while (p < e) {
00999             if (ISASCII(*p)) {
01000                 q = search_nonascii(p, e);
01001                 if (!q) {
01002                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01003                     return c + (e - p);
01004                 }
01005                 c += q - p;
01006                 p = q;
01007             }
01008             ret = rb_enc_precise_mbclen(p, e, enc);
01009             if (MBCLEN_CHARFOUND_P(ret)) {
01010                 *cr |= ENC_CODERANGE_VALID;
01011                 p += MBCLEN_CHARFOUND_LEN(ret);
01012             }
01013             else {
01014                 *cr = ENC_CODERANGE_BROKEN;
01015                 p++;
01016             }
01017             c++;
01018         }
01019         if (!*cr) *cr = ENC_CODERANGE_7BIT;
01020         return c;
01021     }
01022 
01023     for (c=0; p<e; c++) {
01024         ret = rb_enc_precise_mbclen(p, e, enc);
01025         if (MBCLEN_CHARFOUND_P(ret)) {
01026             *cr |= ENC_CODERANGE_VALID;
01027             p += MBCLEN_CHARFOUND_LEN(ret);
01028         }
01029         else {
01030             *cr = ENC_CODERANGE_BROKEN;
01031             if (p + rb_enc_mbminlen(enc) <= e)
01032                 p += rb_enc_mbminlen(enc);
01033             else
01034                 p = e;
01035         }
01036     }
01037     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01038     return c;
01039 }
01040 
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * A UTF-8 lead byte has the bit pattern 0xxxxxxx or 11xxxxxx; only
 * continuation bytes look like 10xxxxxx
 * (see http://en.wikipedia.org/wiki/UTF-8).
 * A single byte can therefore be classified like this:
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // force bit 6 on for ASCII bytes
 * return ((byte>>6) & 1);  // bit 6 now tells whether it is a lead byte
 *
 * This function applies that logic to every byte of the word at `s'
 * at the same time and returns the number of lead bytes found.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE w = *s;

    /* For each byte, leave 1 in bit 0 if it is a lead byte, else 0. */
    w |= ~(w >> 1);
    w >>= 6;
    w &= NONASCII_MASK >> 7;

    /* Sum the per-byte flags into the lowest byte. */
    w += (w >> 8);
    w += (w >> 16);
#if SIZEOF_VALUE == 8
    w += (w >> 32);
#endif
    return (w & 0xF);
}
#endif
01075 
01076 static long
01077 str_strlen(VALUE str, rb_encoding *enc)
01078 {
01079     const char *p, *e;
01080     long n;
01081     int cr;
01082 
01083     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01084     if (!enc) enc = STR_ENC_GET(str);
01085     p = RSTRING_PTR(str);
01086     e = RSTRING_END(str);
01087     cr = ENC_CODERANGE(str);
01088 #ifdef NONASCII_MASK
01089     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01090         enc == rb_utf8_encoding()) {
01091 
01092         VALUE len = 0;
01093         if ((int)sizeof(VALUE) * 2 < e - p) {
01094             const VALUE *s, *t;
01095             const VALUE lowbits = sizeof(VALUE) - 1;
01096             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01097             t = (const VALUE*)(~lowbits & (VALUE)e);
01098             while (p < (const char *)s) {
01099                 if (is_utf8_lead_byte(*p)) len++;
01100                 p++;
01101             }
01102             while (s < t) {
01103                 len += count_utf8_lead_bytes_with_word(s);
01104                 s++;
01105             }
01106             p = (const char *)s;
01107         }
01108         while (p < e) {
01109             if (is_utf8_lead_byte(*p)) len++;
01110             p++;
01111         }
01112         return (long)len;
01113     }
01114 #endif
01115     n = rb_enc_strlen_cr(p, e, enc, &cr);
01116     if (cr) {
01117         ENC_CODERANGE_SET(str, cr);
01118     }
01119     return n;
01120 }
01121 
01122 long
01123 rb_str_strlen(VALUE str)
01124 {
01125     return str_strlen(str, STR_ENC_GET(str));
01126 }
01127 
01128 /*
01129  *  call-seq:
01130  *     str.length   -> integer
01131  *     str.size     -> integer
01132  *
01133  *  Returns the character length of <i>str</i>.
01134  */
01135 
01136 VALUE
01137 rb_str_length(VALUE str)
01138 {
01139     long len;
01140 
01141     len = str_strlen(str, STR_ENC_GET(str));
01142     return LONG2NUM(len);
01143 }
01144 
01145 /*
01146  *  call-seq:
01147  *     str.bytesize  -> integer
01148  *
01149  *  Returns the length of <i>str</i> in bytes.
01150  */
01151 
01152 static VALUE
01153 rb_str_bytesize(VALUE str)
01154 {
01155     return LONG2NUM(RSTRING_LEN(str));
01156 }
01157 
01158 /*
01159  *  call-seq:
01160  *     str.empty?   -> true or false
01161  *
01162  *  Returns <code>true</code> if <i>str</i> has a length of zero.
01163  *
01164  *     "hello".empty?   #=> false
01165  *     "".empty?        #=> true
01166  */
01167 
01168 static VALUE
01169 rb_str_empty(VALUE str)
01170 {
01171     if (RSTRING_LEN(str) == 0)
01172         return Qtrue;
01173     return Qfalse;
01174 }
01175 
01176 /*
01177  *  call-seq:
01178  *     str + other_str   -> new_str
01179  *
01180  *  Concatenation---Returns a new <code>String</code> containing
01181  *  <i>other_str</i> concatenated to <i>str</i>.
01182  *
01183  *     "Hello from " + self.to_s   #=> "Hello from main"
01184  */
01185 
01186 VALUE
01187 rb_str_plus(VALUE str1, VALUE str2)
01188 {
01189     VALUE str3;
01190     rb_encoding *enc;
01191 
01192     StringValue(str2);
01193     enc = rb_enc_check(str1, str2);
01194     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01195     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01196     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01197            RSTRING_PTR(str2), RSTRING_LEN(str2));
01198     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01199 
01200     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01201         OBJ_TAINT(str3);
01202     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01203                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01204     return str3;
01205 }
01206 
01207 /*
01208  *  call-seq:
01209  *     str * integer   -> new_str
01210  *
01211  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
01212  *  the receiver.
01213  *
01214  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
01215  */
01216 
01217 VALUE
01218 rb_str_times(VALUE str, VALUE times)
01219 {
01220     VALUE str2;
01221     long n, len;
01222     char *ptr2;
01223 
01224     len = NUM2LONG(times);
01225     if (len < 0) {
01226         rb_raise(rb_eArgError, "negative argument");
01227     }
01228     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
01229         rb_raise(rb_eArgError, "argument too big");
01230     }
01231 
01232     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01233     ptr2 = RSTRING_PTR(str2);
01234     if (len) {
01235         n = RSTRING_LEN(str);
01236         memcpy(ptr2, RSTRING_PTR(str), n);
01237         while (n <= len/2) {
01238             memcpy(ptr2 + n, ptr2, n);
01239             n *= 2;
01240         }
01241         memcpy(ptr2 + n, ptr2, len-n);
01242     }
01243     ptr2[RSTRING_LEN(str2)] = '\0';
01244     OBJ_INFECT(str2, str);
01245     rb_enc_cr_str_copy_for_substr(str2, str);
01246 
01247     return str2;
01248 }
01249 
01250 /*
01251  *  call-seq:
01252  *     str % arg   -> new_str
01253  *
01254  *  Format---Uses <i>str</i> as a format specification, and returns the result
01255  *  of applying it to <i>arg</i>. If the format specification contains more than
01256  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
01257  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
01258  *  details of the format string.
01259  *
01260  *     "%05d" % 123                              #=> "00123"
01261  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
01262  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
01263  */
01264 
01265 static VALUE
01266 rb_str_format_m(VALUE str, VALUE arg)
01267 {
01268     volatile VALUE tmp = rb_check_array_type(arg);
01269 
01270     if (!NIL_P(tmp)) {
01271         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01272     }
01273     return rb_str_format(1, &arg, str);
01274 }
01275 
01276 static inline void
01277 str_modifiable(VALUE str)
01278 {
01279     if (FL_TEST(str, STR_TMPLOCK)) {
01280         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01281     }
01282     rb_check_frozen(str);
01283     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01284         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01285 }
01286 
01287 static inline int
01288 str_independent(VALUE str)
01289 {
01290     str_modifiable(str);
01291     if (!STR_SHARED_P(str)) return 1;
01292     if (STR_EMBED_P(str)) return 1;
01293     return 0;
01294 }
01295 
01296 static void
01297 str_make_independent_expand(VALUE str, long expand)
01298 {
01299     char *ptr;
01300     long len = RSTRING_LEN(str);
01301     long capa = len + expand;
01302 
01303     if (len > capa) len = capa;
01304     ptr = ALLOC_N(char, capa + 1);
01305     if (RSTRING_PTR(str)) {
01306         memcpy(ptr, RSTRING_PTR(str), len);
01307     }
01308     STR_SET_NOEMBED(str);
01309     STR_UNSET_NOCAPA(str);
01310     ptr[len] = 0;
01311     RSTRING(str)->as.heap.ptr = ptr;
01312     RSTRING(str)->as.heap.len = len;
01313     RSTRING(str)->as.heap.aux.capa = capa;
01314 }
01315 
01316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01317 
01318 void
01319 rb_str_modify(VALUE str)
01320 {
01321     if (!str_independent(str))
01322         str_make_independent(str);
01323     ENC_CODERANGE_CLEAR(str);
01324 }
01325 
01326 void
01327 rb_str_modify_expand(VALUE str, long expand)
01328 {
01329     if (expand < 0) {
01330         rb_raise(rb_eArgError, "negative expanding string size");
01331     }
01332     if (!str_independent(str)) {
01333         str_make_independent_expand(str, expand);
01334     }
01335     else if (expand > 0) {
01336         long len = RSTRING_LEN(str);
01337         long capa = len + expand;
01338         if (!STR_EMBED_P(str)) {
01339             REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01340             RSTRING(str)->as.heap.aux.capa = capa;
01341         }
01342         else if (capa > RSTRING_EMBED_LEN_MAX) {
01343             str_make_independent_expand(str, expand);
01344         }
01345     }
01346     ENC_CODERANGE_CLEAR(str);
01347 }
01348 
01349 /* As rb_str_modify(), but don't clear coderange */
01350 static void
01351 str_modify_keep_cr(VALUE str)
01352 {
01353     if (!str_independent(str))
01354         str_make_independent(str);
01355     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01356         /* Force re-scan later */
01357         ENC_CODERANGE_CLEAR(str);
01358 }
01359 
01360 static inline void
01361 str_discard(VALUE str)
01362 {
01363     str_modifiable(str);
01364     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01365         xfree(RSTRING_PTR(str));
01366         RSTRING(str)->as.heap.ptr = 0;
01367         RSTRING(str)->as.heap.len = 0;
01368     }
01369 }
01370 
01371 void
01372 rb_str_associate(VALUE str, VALUE add)
01373 {
01374     /* sanity check */
01375     rb_check_frozen(str);
01376     if (STR_ASSOC_P(str)) {
01377         /* already associated */
01378         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01379     }
01380     else {
01381         if (STR_SHARED_P(str)) {
01382             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01383             str_make_independent(str);
01384             if (STR_ASSOC_P(assoc)) {
01385                 assoc = RSTRING(assoc)->as.heap.aux.shared;
01386                 rb_ary_concat(assoc, add);
01387                 add = assoc;
01388             }
01389         }
01390         else if (STR_EMBED_P(str)) {
01391             str_make_independent(str);
01392         }
01393         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01394             RESIZE_CAPA(str, RSTRING_LEN(str));
01395         }
01396         FL_SET(str, STR_ASSOC);
01397         RBASIC(add)->klass = 0;
01398         RSTRING(str)->as.heap.aux.shared = add;
01399     }
01400 }
01401 
01402 VALUE
01403 rb_str_associated(VALUE str)
01404 {
01405     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01406     if (STR_ASSOC_P(str)) {
01407         return RSTRING(str)->as.heap.aux.shared;
01408     }
01409     return Qfalse;
01410 }
01411 
01412 VALUE
01413 rb_string_value(volatile VALUE *ptr)
01414 {
01415     VALUE s = *ptr;
01416     if (TYPE(s) != T_STRING) {
01417         s = rb_str_to_str(s);
01418         *ptr = s;
01419     }
01420     return s;
01421 }
01422 
01423 char *
01424 rb_string_value_ptr(volatile VALUE *ptr)
01425 {
01426     VALUE str = rb_string_value(ptr);
01427     return RSTRING_PTR(str);
01428 }
01429 
01430 char *
01431 rb_string_value_cstr(volatile VALUE *ptr)
01432 {
01433     VALUE str = rb_string_value(ptr);
01434     char *s = RSTRING_PTR(str);
01435     long len = RSTRING_LEN(str);
01436 
01437     if (!s || memchr(s, 0, len)) {
01438         rb_raise(rb_eArgError, "string contains null byte");
01439     }
01440     if (s[len]) {
01441         rb_str_modify(str);
01442         s = RSTRING_PTR(str);
01443         s[RSTRING_LEN(str)] = 0;
01444     }
01445     return s;
01446 }
01447 
01448 VALUE
01449 rb_check_string_type(VALUE str)
01450 {
01451     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01452     return str;
01453 }
01454 
01455 /*
01456  *  call-seq:
01457  *     String.try_convert(obj) -> string or nil
01458  *
01459  *  Try to convert <i>obj</i> into a String, using to_str method.
01460  *  Returns converted string or nil if <i>obj</i> cannot be converted
01461  *  for any reason.
01462  *
01463  *     String.try_convert("str")     #=> "str"
01464  *     String.try_convert(/re/)      #=> nil
01465  */
01466 static VALUE
01467 rb_str_s_try_convert(VALUE dummy, VALUE str)
01468 {
01469     return rb_check_string_type(str);
01470 }
01471 
01472 static char*
01473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01474 {
01475     long nth = *nthp;
01476     if (rb_enc_mbmaxlen(enc) == 1) {
01477         p += nth;
01478     }
01479     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01480         p += nth * rb_enc_mbmaxlen(enc);
01481     }
01482     else if (rb_enc_asciicompat(enc)) {
01483         const char *p2, *e2;
01484         int n;
01485 
01486         while (p < e && 0 < nth) {
01487             e2 = p + nth;
01488             if (e < e2) {
01489                 *nthp = nth;
01490                 return (char *)e;
01491             }
01492             if (ISASCII(*p)) {
01493                 p2 = search_nonascii(p, e2);
01494                 if (!p2) {
01495                     *nthp = nth;
01496                     return (char *)e2;
01497                 }
01498                 nth -= p2 - p;
01499                 p = p2;
01500             }
01501             n = rb_enc_mbclen(p, e, enc);
01502             p += n;
01503             nth--;
01504         }
01505         *nthp = nth;
01506         if (nth != 0) {
01507             return (char *)e;
01508         }
01509         return (char *)p;
01510     }
01511     else {
01512         while (p < e && nth--) {
01513             p += rb_enc_mbclen(p, e, enc);
01514         }
01515     }
01516     if (p > e) p = e;
01517     *nthp = nth;
01518     return (char*)p;
01519 }
01520 
01521 char*
01522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01523 {
01524     return str_nth_len(p, e, &nth, enc);
01525 }
01526 
01527 static char*
01528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01529 {
01530     if (singlebyte)
01531         p += nth;
01532     else {
01533         p = str_nth_len(p, e, &nth, enc);
01534     }
01535     if (!p) return 0;
01536     if (p > e) p = e;
01537     return (char *)p;
01538 }
01539 
01540 /* char offset to byte offset */
01541 static long
01542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01543 {
01544     const char *pp = str_nth(p, e, nth, enc, singlebyte);
01545     if (!pp) return e - p;
01546     return pp - p;
01547 }
01548 
01549 long
01550 rb_str_offset(VALUE str, long pos)
01551 {
01552     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01553                       STR_ENC_GET(str), single_byte_optimizable(str));
01554 }
01555 
#ifdef NONASCII_MASK
/*
 * Walk [p, e) counting UTF-8 lead bytes until *nthp characters have
 * been passed, and return the position reached.  The number of
 * characters still missing when the walk stopped is stored in *nthp.
 * Long inputs with a large count are first skipped a word at a time.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;

    if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* first and last word-aligned addresses inside [p, e) */
        const VALUE *word = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        const VALUE *word_end = (const VALUE*)(~lowbits & (VALUE)e);

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (is_utf8_lead_byte(*p)) nth--;
        }
        /* whole words, while a full word of lead bytes cannot overshoot */
        do {
            nth -= count_utf8_lead_bytes_with_word(word);
            word++;
        } while (word < word_end && (int)sizeof(VALUE) <= nth);
        p = (char *)word;
    }
    /* finish byte by byte, stopping on the lead byte of the target */
    for (; p < e; p++) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
    }
    *nthp = nth;
    return (char *)p;
}

/* Byte offset from `p` of the `nth` character of the UTF-8 range [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
#endif
01594 
01595 /* byte offset to char offset */
01596 long
01597 rb_str_sublen(VALUE str, long pos)
01598 {
01599     if (single_byte_optimizable(str) || pos < 0)
01600         return pos;
01601     else {
01602         char *p = RSTRING_PTR(str);
01603         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01604     }
01605 }
01606 
01607 VALUE
01608 rb_str_subseq(VALUE str, long beg, long len)
01609 {
01610     VALUE str2;
01611 
01612     if (RSTRING_LEN(str) == beg + len &&
01613         RSTRING_EMBED_LEN_MAX < len) {
01614         str2 = rb_str_new_shared(rb_str_new_frozen(str));
01615         rb_str_drop_bytes(str2, beg);
01616     }
01617     else {
01618         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01619     }
01620 
01621     rb_enc_cr_str_copy_for_substr(str2, str);
01622     OBJ_INFECT(str2, str);
01623 
01624     return str2;
01625 }
01626 
01627 VALUE
01628 rb_str_substr(VALUE str, long beg, long len)
01629 {
01630     rb_encoding *enc = STR_ENC_GET(str);
01631     VALUE str2;
01632     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01633 
01634     if (len < 0) return Qnil;
01635     if (!RSTRING_LEN(str)) {
01636         len = 0;
01637     }
01638     if (single_byte_optimizable(str)) {
01639         if (beg > RSTRING_LEN(str)) return Qnil;
01640         if (beg < 0) {
01641             beg += RSTRING_LEN(str);
01642             if (beg < 0) return Qnil;
01643         }
01644         if (beg + len > RSTRING_LEN(str))
01645             len = RSTRING_LEN(str) - beg;
01646         if (len <= 0) {
01647             len = 0;
01648             p = 0;
01649         }
01650         else
01651             p = s + beg;
01652         goto sub;
01653     }
01654     if (beg < 0) {
01655         if (len > -beg) len = -beg;
01656         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01657             beg = -beg;
01658             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01659             p = e;
01660             if (!p) return Qnil;
01661             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01662             if (!p) return Qnil;
01663             len = e - p;
01664             goto sub;
01665         }
01666         else {
01667             beg += str_strlen(str, enc);
01668             if (beg < 0) return Qnil;
01669         }
01670     }
01671     else if (beg > 0 && beg > RSTRING_LEN(str)) {
01672         return Qnil;
01673     }
01674     if (len == 0) {
01675         if (beg > str_strlen(str, enc)) return Qnil;
01676         p = 0;
01677     }
01678 #ifdef NONASCII_MASK
01679     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01680         enc == rb_utf8_encoding()) {
01681         p = str_utf8_nth(s, e, &beg);
01682         if (beg > 0) return Qnil;
01683         len = str_utf8_offset(p, e, len);
01684     }
01685 #endif
01686     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01687         int char_sz = rb_enc_mbmaxlen(enc);
01688 
01689         p = s + beg * char_sz;
01690         if (p > e) {
01691             return Qnil;
01692         }
01693         else if (len * char_sz > e - p)
01694             len = e - p;
01695         else
01696             len *= char_sz;
01697     }
01698     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01699         if (beg > 0) return Qnil;
01700         len = 0;
01701     }
01702     else {
01703         len = str_offset(p, e, len, enc, 0);
01704     }
01705   sub:
01706     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
01707         str2 = rb_str_new4(str);
01708         str2 = str_new3(rb_obj_class(str2), str2);
01709         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01710         RSTRING(str2)->as.heap.len = len;
01711     }
01712     else {
01713         str2 = rb_str_new5(str, p, len);
01714         rb_enc_cr_str_copy_for_substr(str2, str);
01715         OBJ_INFECT(str2, str);
01716     }
01717 
01718     return str2;
01719 }
01720 
01721 VALUE
01722 rb_str_freeze(VALUE str)
01723 {
01724     if (STR_ASSOC_P(str)) {
01725         VALUE ary = RSTRING(str)->as.heap.aux.shared;
01726         OBJ_FREEZE(ary);
01727     }
01728     return rb_obj_freeze(str);
01729 }
01730 
01731 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01732 #define rb_str_dup_frozen rb_str_new_frozen
01733 
01734 VALUE
01735 rb_str_locktmp(VALUE str)
01736 {
01737     if (FL_TEST(str, STR_TMPLOCK)) {
01738         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01739     }
01740     FL_SET(str, STR_TMPLOCK);
01741     return str;
01742 }
01743 
01744 VALUE
01745 rb_str_unlocktmp(VALUE str)
01746 {
01747     if (!FL_TEST(str, STR_TMPLOCK)) {
01748         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01749     }
01750     FL_UNSET(str, STR_TMPLOCK);
01751     return str;
01752 }
01753 
01754 void
01755 rb_str_set_len(VALUE str, long len)
01756 {
01757     long capa;
01758 
01759     str_modifiable(str);
01760     if (STR_SHARED_P(str)) {
01761         rb_raise(rb_eRuntimeError, "can't set length of shared string");
01762     }
01763     if (len > (capa = (long)rb_str_capacity(str))) {
01764         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01765     }
01766     STR_SET_LEN(str, len);
01767     RSTRING_PTR(str)[len] = '\0';
01768 }
01769 
01770 VALUE
01771 rb_str_resize(VALUE str, long len)
01772 {
01773     long slen;
01774     int independent;
01775 
01776     if (len < 0) {
01777         rb_raise(rb_eArgError, "negative string size (or size too big)");
01778     }
01779 
01780     independent = str_independent(str);
01781     ENC_CODERANGE_CLEAR(str);
01782     slen = RSTRING_LEN(str);
01783     if (len != slen) {
01784         if (STR_EMBED_P(str)) {
01785             if (len <= RSTRING_EMBED_LEN_MAX) {
01786                 STR_SET_EMBED_LEN(str, len);
01787                 RSTRING(str)->as.ary[len] = '\0';
01788                 return str;
01789             }
01790             str_make_independent_expand(str, len - slen);
01791             STR_SET_NOEMBED(str);
01792         }
01793         else if (len <= RSTRING_EMBED_LEN_MAX) {
01794             char *ptr = RSTRING(str)->as.heap.ptr;
01795             STR_SET_EMBED(str);
01796             if (slen > len) slen = len;
01797             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01798             RSTRING(str)->as.ary[len] = '\0';
01799             STR_SET_EMBED_LEN(str, len);
01800             if (independent) xfree(ptr);
01801             return str;
01802         }
01803         else if (!independent) {
01804             str_make_independent_expand(str, len - slen);
01805         }
01806         else if (slen < len || slen - len > 1024) {
01807             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01808         }
01809         if (!STR_NOCAPA_P(str)) {
01810             RSTRING(str)->as.heap.aux.capa = len;
01811         }
01812         RSTRING(str)->as.heap.len = len;
01813         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
01814     }
01815     return str;
01816 }
01817 
01818 static VALUE
01819 str_buf_cat(VALUE str, const char *ptr, long len)
01820 {
01821     long capa, total, off = -1;
01822 
01823     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01824         off = ptr - RSTRING_PTR(str);
01825     }
01826     rb_str_modify(str);
01827     if (len == 0) return 0;
01828     if (STR_ASSOC_P(str)) {
01829         FL_UNSET(str, STR_ASSOC);
01830         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01831     }
01832     else if (STR_EMBED_P(str)) {
01833         capa = RSTRING_EMBED_LEN_MAX;
01834     }
01835     else {
01836         capa = RSTRING(str)->as.heap.aux.capa;
01837     }
01838     if (RSTRING_LEN(str) >= LONG_MAX - len) {
01839         rb_raise(rb_eArgError, "string sizes too big");
01840     }
01841     total = RSTRING_LEN(str)+len;
01842     if (capa <= total) {
01843         while (total > capa) {
01844             if (capa + 1 >= LONG_MAX / 2) {
01845                 capa = (total + 4095) / 4096;
01846                 break;
01847             }
01848             capa = (capa + 1) * 2;
01849         }
01850         RESIZE_CAPA(str, capa);
01851     }
01852     if (off != -1) {
01853         ptr = RSTRING_PTR(str) + off;
01854     }
01855     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01856     STR_SET_LEN(str, total);
01857     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
01858 
01859     return str;
01860 }
01861 
01862 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01863 
01864 VALUE
01865 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01866 {
01867     if (len == 0) return str;
01868     if (len < 0) {
01869         rb_raise(rb_eArgError, "negative string size (or size too big)");
01870     }
01871     return str_buf_cat(str, ptr, len);
01872 }
01873 
01874 VALUE
01875 rb_str_buf_cat2(VALUE str, const char *ptr)
01876 {
01877     return rb_str_buf_cat(str, ptr, strlen(ptr));
01878 }
01879 
01880 VALUE
01881 rb_str_cat(VALUE str, const char *ptr, long len)
01882 {
01883     if (len < 0) {
01884         rb_raise(rb_eArgError, "negative string size (or size too big)");
01885     }
01886     if (STR_ASSOC_P(str)) {
01887         char *p;
01888         rb_str_modify_expand(str, len);
01889         p = RSTRING(str)->as.heap.ptr;
01890         memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01891         len = RSTRING(str)->as.heap.len += len;
01892         p[len] = '\0'; /* sentinel */
01893         return str;
01894     }
01895 
01896     return rb_str_buf_cat(str, ptr, len);
01897 }
01898 
01899 VALUE
01900 rb_str_cat2(VALUE str, const char *ptr)
01901 {
01902     return rb_str_cat(str, ptr, strlen(ptr));
01903 }
01904 
01905 static VALUE
01906 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01907     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01908 {
01909     int str_encindex = ENCODING_GET(str);
01910     int res_encindex;
01911     int str_cr, res_cr;
01912 
01913     str_cr = ENC_CODERANGE(str);
01914 
01915     if (str_encindex == ptr_encindex) {
01916         if (str_cr == ENC_CODERANGE_UNKNOWN)
01917             ptr_cr = ENC_CODERANGE_UNKNOWN;
01918         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01919             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01920         }
01921     }
01922     else {
01923         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01924         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
01925         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01926             if (len == 0)
01927                 return str;
01928             if (RSTRING_LEN(str) == 0) {
01929                 rb_str_buf_cat(str, ptr, len);
01930                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01931                 return str;
01932             }
01933             goto incompatible;
01934         }
01935         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01936             ptr_cr = coderange_scan(ptr, len, ptr_enc);
01937         }
01938         if (str_cr == ENC_CODERANGE_UNKNOWN) {
01939             if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
01940                 str_cr = rb_enc_str_coderange(str);
01941             }
01942         }
01943     }
01944     if (ptr_cr_ret)
01945         *ptr_cr_ret = ptr_cr;
01946 
01947     if (str_encindex != ptr_encindex &&
01948         str_cr != ENC_CODERANGE_7BIT &&
01949         ptr_cr != ENC_CODERANGE_7BIT) {
01950       incompatible:
01951         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01952             rb_enc_name(rb_enc_from_index(str_encindex)),
01953             rb_enc_name(rb_enc_from_index(ptr_encindex)));
01954     }
01955 
01956     if (str_cr == ENC_CODERANGE_UNKNOWN) {
01957         res_encindex = str_encindex;
01958         res_cr = ENC_CODERANGE_UNKNOWN;
01959     }
01960     else if (str_cr == ENC_CODERANGE_7BIT) {
01961         if (ptr_cr == ENC_CODERANGE_7BIT) {
01962             res_encindex = str_encindex;
01963             res_cr = ENC_CODERANGE_7BIT;
01964         }
01965         else {
01966             res_encindex = ptr_encindex;
01967             res_cr = ptr_cr;
01968         }
01969     }
01970     else if (str_cr == ENC_CODERANGE_VALID) {
01971         res_encindex = str_encindex;
01972         if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01973             res_cr = str_cr;
01974         else
01975             res_cr = ptr_cr;
01976     }
01977     else { /* str_cr == ENC_CODERANGE_BROKEN */
01978         res_encindex = str_encindex;
01979         res_cr = str_cr;
01980         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01981     }
01982 
01983     if (len < 0) {
01984         rb_raise(rb_eArgError, "negative string size (or size too big)");
01985     }
01986     str_buf_cat(str, ptr, len);
01987     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01988     return str;
01989 }
01990 
01991 VALUE
01992 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01993 {
01994     return rb_enc_cr_str_buf_cat(str, ptr, len,
01995         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01996 }
01997 
01998 VALUE
01999 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02000 {
02001     /* ptr must reference NUL terminated ASCII string. */
02002     int encindex = ENCODING_GET(str);
02003     rb_encoding *enc = rb_enc_from_index(encindex);
02004     if (rb_enc_asciicompat(enc)) {
02005         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02006             encindex, ENC_CODERANGE_7BIT, 0);
02007     }
02008     else {
02009         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02010         while (*ptr) {
02011             unsigned int c = (unsigned char)*ptr;
02012             int len = rb_enc_codelen(c, enc);
02013             rb_enc_mbcput(c, buf, enc);
02014             rb_enc_cr_str_buf_cat(str, buf, len,
02015                 encindex, ENC_CODERANGE_VALID, 0);
02016             ptr++;
02017         }
02018         return str;
02019     }
02020 }
02021 
02022 VALUE
02023 rb_str_buf_append(VALUE str, VALUE str2)
02024 {
02025     int str2_cr;
02026 
02027     str2_cr = ENC_CODERANGE(str2);
02028 
02029     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02030         ENCODING_GET(str2), str2_cr, &str2_cr);
02031 
02032     OBJ_INFECT(str, str2);
02033     ENC_CODERANGE_SET(str2, str2_cr);
02034 
02035     return str;
02036 }
02037 
02038 VALUE
02039 rb_str_append(VALUE str, VALUE str2)
02040 {
02041     rb_encoding *enc;
02042     int cr, cr2;
02043     long len2;
02044 
02045     StringValue(str2);
02046     if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02047         long len = RSTRING_LEN(str) + len2;
02048         enc = rb_enc_check(str, str2);
02049         cr = ENC_CODERANGE(str);
02050         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
02051         rb_str_modify_expand(str, len2);
02052         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
02053                RSTRING_PTR(str2), len2+1);
02054         RSTRING(str)->as.heap.len = len;
02055         rb_enc_associate(str, enc);
02056         ENC_CODERANGE_SET(str, cr);
02057         OBJ_INFECT(str, str2);
02058         return str;
02059     }
02060     return rb_str_buf_append(str, str2);
02061 }
02062 
02063 /*
02064  *  call-seq:
02065  *     str << integer       -> str
02066  *     str.concat(integer)  -> str
02067  *     str << obj           -> str
02068  *     str.concat(obj)      -> str
02069  *
02070  *  Append---Concatenates the given object to <i>str</i>. If the object is a
02071  *  <code>Integer</code>, it is considered as a codepoint, and is converted
02072  *  to a character before concatenation.
02073  *
02074  *     a = "hello "
02075  *     a << "world"   #=> "hello world"
02076  *     a.concat(33)   #=> "hello world!"
02077  */
02078 
02079 VALUE
02080 rb_str_concat(VALUE str1, VALUE str2)
02081 {
02082     unsigned int code;
02083     rb_encoding *enc = STR_ENC_GET(str1);
02084 
02085     if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
02086         if (rb_num_to_uint(str2, &code) == 0) {
02087         }
02088         else if (FIXNUM_P(str2)) {
02089             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02090         }
02091         else {
02092             rb_raise(rb_eRangeError, "bignum out of char range");
02093         }
02094     }
02095     else {
02096         return rb_str_append(str1, str2);
02097     }
02098 
02099     if (enc == rb_usascii_encoding()) {
02100         /* US-ASCII automatically extended to ASCII-8BIT */
02101         char buf[1];
02102         buf[0] = (char)code;
02103         if (code > 0xFF) {
02104             rb_raise(rb_eRangeError, "%u out of char range", code);
02105         }
02106         rb_str_cat(str1, buf, 1);
02107         if (code > 127) {
02108             rb_enc_associate(str1, rb_ascii8bit_encoding());
02109             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02110         }
02111     }
02112     else {
02113         long pos = RSTRING_LEN(str1);
02114         int cr = ENC_CODERANGE(str1);
02115         int len;
02116         char *buf;
02117 
02118         switch (len = rb_enc_codelen(code, enc)) {
02119           case ONIGERR_INVALID_CODE_POINT_VALUE:
02120             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02121             break;
02122           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02123           case 0:
02124             rb_raise(rb_eRangeError, "%u out of char range", code);
02125             break;
02126         }
02127         buf = ALLOCA_N(char, len + 1);
02128         rb_enc_mbcput(code, buf, enc);
02129         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02130             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02131         }
02132         rb_str_resize(str1, pos+len);
02133         strncpy(RSTRING_PTR(str1) + pos, buf, len);
02134         if (cr == ENC_CODERANGE_7BIT && code > 127)
02135             cr = ENC_CODERANGE_VALID;
02136         ENC_CODERANGE_SET(str1, cr);
02137     }
02138     return str1;
02139 }
02140 
02141 /*
02142  *  call-seq:
02143  *     str.prepend(other_str)  -> str
02144  *
02145  *  Prepend---Prepend the given string to <i>str</i>.
02146  *
02147  *  a = "world"
02148  *  a.prepend("hello ") #=> "hello world"
02149  *  a                   #=> "hello world"
02150  */
02151 
02152 static VALUE
02153 rb_str_prepend(VALUE str, VALUE str2)
02154 {
02155     StringValue(str2);
02156     StringValue(str);
02157     rb_str_update(str, 0L, 0L, str2);
02158     return str;
02159 }
02160 
02161 st_index_t
02162 rb_str_hash(VALUE str)
02163 {
02164     int e = ENCODING_GET(str);
02165     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02166         e = 0;
02167     }
02168     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02169 }
02170 
02171 int
02172 rb_str_hash_cmp(VALUE str1, VALUE str2)
02173 {
02174     long len;
02175 
02176     if (!rb_str_comparable(str1, str2)) return 1;
02177     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02178         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02179         return 0;
02180     }
02181     return 1;
02182 }
02183 
02184 /*
02185  * call-seq:
02186  *    str.hash   -> fixnum
02187  *
02188  * Return a hash based on the string's length and content.
02189  */
02190 
02191 static VALUE
02192 rb_str_hash_m(VALUE str)
02193 {
02194     st_index_t hval = rb_str_hash(str);
02195     return INT2FIX(hval);
02196 }
02197 
02198 #define lesser(a,b) (((a)>(b))?(b):(a))
02199 
02200 int
02201 rb_str_comparable(VALUE str1, VALUE str2)
02202 {
02203     int idx1, idx2;
02204     int rc1, rc2;
02205 
02206     if (RSTRING_LEN(str1) == 0) return TRUE;
02207     if (RSTRING_LEN(str2) == 0) return TRUE;
02208     idx1 = ENCODING_GET(str1);
02209     idx2 = ENCODING_GET(str2);
02210     if (idx1 == idx2) return TRUE;
02211     rc1 = rb_enc_str_coderange(str1);
02212     rc2 = rb_enc_str_coderange(str2);
02213     if (rc1 == ENC_CODERANGE_7BIT) {
02214         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02215         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02216             return TRUE;
02217     }
02218     if (rc2 == ENC_CODERANGE_7BIT) {
02219         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02220             return TRUE;
02221     }
02222     return FALSE;
02223 }
02224 
02225 int
02226 rb_str_cmp(VALUE str1, VALUE str2)
02227 {
02228     long len1, len2;
02229     const char *ptr1, *ptr2;
02230     int retval;
02231 
02232     if (str1 == str2) return 0;
02233     RSTRING_GETMEM(str1, ptr1, len1);
02234     RSTRING_GETMEM(str2, ptr2, len2);
02235     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02236         if (len1 == len2) {
02237             if (!rb_str_comparable(str1, str2)) {
02238                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02239                     return 1;
02240                 return -1;
02241             }
02242             return 0;
02243         }
02244         if (len1 > len2) return 1;
02245         return -1;
02246     }
02247     if (retval > 0) return 1;
02248     return -1;
02249 }
02250 
02251 /* expect tail call optimization */
02252 static VALUE
02253 str_eql(const VALUE str1, const VALUE str2)
02254 {
02255     const long len = RSTRING_LEN(str1);
02256     const char *ptr1, *ptr2;
02257 
02258     if (len != RSTRING_LEN(str2)) return Qfalse;
02259     if (!rb_str_comparable(str1, str2)) return Qfalse;
02260     if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02261         return Qtrue;
02262     if (memcmp(ptr1, ptr2, len) == 0)
02263         return Qtrue;
02264     return Qfalse;
02265 }
02266 /*
02267  *  call-seq:
02268  *     str == obj   -> true or false
02269  *
02270  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
02271  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
02272  *  <code><=></code> <i>obj</i> returns zero.
02273  */
02274 
02275 VALUE
02276 rb_str_equal(VALUE str1, VALUE str2)
02277 {
02278     if (str1 == str2) return Qtrue;
02279     if (TYPE(str2) != T_STRING) {
02280         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02281             return Qfalse;
02282         }
02283         return rb_equal(str2, str1);
02284     }
02285     return str_eql(str1, str2);
02286 }
02287 
02288 /*
02289  * call-seq:
02290  *   str.eql?(other)   -> true or false
02291  *
02292  * Two strings are equal if they have the same length and content.
02293  */
02294 
02295 static VALUE
02296 rb_str_eql(VALUE str1, VALUE str2)
02297 {
02298     if (str1 == str2) return Qtrue;
02299     if (TYPE(str2) != T_STRING) return Qfalse;
02300     return str_eql(str1, str2);
02301 }
02302 
02303 /*
02304  *  call-seq:
02305  *     str <=> other_str   -> -1, 0, +1 or nil
02306  *
02307  *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
02308  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
02309  *  <i>str</i>. If the strings are of different lengths, and the strings are
02310  *  equal when compared up to the shortest length, then the longer string is
02311  *  considered greater than the shorter one. In older versions of Ruby, setting
02312  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
02313  *  in favor of using <code>String#casecmp</code>.
02314  *
02315  *  <code><=></code> is the basis for the methods <code><</code>,
02316  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
02317  *  included from module <code>Comparable</code>.  The method
02318  *  <code>String#==</code> does not use <code>Comparable#==</code>.
02319  *
02320  *     "abcdef" <=> "abcde"     #=> 1
02321  *     "abcdef" <=> "abcdef"    #=> 0
02322  *     "abcdef" <=> "abcdefg"   #=> -1
02323  *     "abcdef" <=> "ABCDEF"    #=> 1
02324  */
02325 
02326 static VALUE
02327 rb_str_cmp_m(VALUE str1, VALUE str2)
02328 {
02329     long result;
02330 
02331     if (TYPE(str2) != T_STRING) {
02332         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02333             return Qnil;
02334         }
02335         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02336             return Qnil;
02337         }
02338         else {
02339             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02340 
02341             if (NIL_P(tmp)) return Qnil;
02342             if (!FIXNUM_P(tmp)) {
02343                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02344             }
02345             result = -FIX2LONG(tmp);
02346         }
02347     }
02348     else {
02349         result = rb_str_cmp(str1, str2);
02350     }
02351     return LONG2NUM(result);
02352 }
02353 
02354 /*
02355  *  call-seq:
02356  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
02357  *
02358  *  Case-insensitive version of <code>String#<=></code>.
02359  *
02360  *     "abcdef".casecmp("abcde")     #=> 1
02361  *     "aBcDeF".casecmp("abcdef")    #=> 0
02362  *     "abcdef".casecmp("abcdefg")   #=> -1
02363  *     "abcdef".casecmp("ABCDEF")    #=> 0
02364  */
02365 
02366 static VALUE
02367 rb_str_casecmp(VALUE str1, VALUE str2)
02368 {
02369     long len;
02370     rb_encoding *enc;
02371     char *p1, *p1end, *p2, *p2end;
02372 
02373     StringValue(str2);
02374     enc = rb_enc_compatible(str1, str2);
02375     if (!enc) {
02376         return Qnil;
02377     }
02378 
02379     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02380     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02381     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02382         while (p1 < p1end && p2 < p2end) {
02383             if (*p1 != *p2) {
02384                 unsigned int c1 = TOUPPER(*p1 & 0xff);
02385                 unsigned int c2 = TOUPPER(*p2 & 0xff);
02386                 if (c1 != c2)
02387                     return INT2FIX(c1 < c2 ? -1 : 1);
02388             }
02389             p1++;
02390             p2++;
02391         }
02392     }
02393     else {
02394         while (p1 < p1end && p2 < p2end) {
02395             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02396             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02397 
02398             if (0 <= c1 && 0 <= c2) {
02399                 c1 = TOUPPER(c1);
02400                 c2 = TOUPPER(c2);
02401                 if (c1 != c2)
02402                     return INT2FIX(c1 < c2 ? -1 : 1);
02403             }
02404             else {
02405                 int r;
02406                 l1 = rb_enc_mbclen(p1, p1end, enc);
02407                 l2 = rb_enc_mbclen(p2, p2end, enc);
02408                 len = l1 < l2 ? l1 : l2;
02409                 r = memcmp(p1, p2, len);
02410                 if (r != 0)
02411                     return INT2FIX(r < 0 ? -1 : 1);
02412                 if (l1 != l2)
02413                     return INT2FIX(l1 < l2 ? -1 : 1);
02414             }
02415             p1 += l1;
02416             p2 += l2;
02417         }
02418     }
02419     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02420     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02421     return INT2FIX(-1);
02422 }
02423 
02424 static long
02425 rb_str_index(VALUE str, VALUE sub, long offset)
02426 {
02427     long pos;
02428     char *s, *sptr, *e;
02429     long len, slen;
02430     rb_encoding *enc;
02431 
02432     enc = rb_enc_check(str, sub);
02433     if (is_broken_string(sub)) {
02434         return -1;
02435     }
02436     len = str_strlen(str, enc);
02437     slen = str_strlen(sub, enc);
02438     if (offset < 0) {
02439         offset += len;
02440         if (offset < 0) return -1;
02441     }
02442     if (len - offset < slen) return -1;
02443     s = RSTRING_PTR(str);
02444     e = s + RSTRING_LEN(str);
02445     if (offset) {
02446         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02447         s += offset;
02448     }
02449     if (slen == 0) return offset;
02450     /* need proceed one character at a time */
02451     sptr = RSTRING_PTR(sub);
02452     slen = RSTRING_LEN(sub);
02453     len = RSTRING_LEN(str) - offset;
02454     for (;;) {
02455         char *t;
02456         pos = rb_memsearch(sptr, slen, s, len, enc);
02457         if (pos < 0) return pos;
02458         t = rb_enc_right_char_head(s, s+pos, e, enc);
02459         if (t == s + pos) break;
02460         if ((len -= t - s) <= 0) return -1;
02461         offset += t - s;
02462         s = t;
02463     }
02464     return pos + offset;
02465 }
02466 
02467 
02468 /*
02469  *  call-seq:
02470  *     str.index(substring [, offset])   -> fixnum or nil
02471  *     str.index(regexp [, offset])      -> fixnum or nil
02472  *
02473  *  Returns the index of the first occurrence of the given <i>substring</i> or
02474  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02475  *  found. If the second parameter is present, it specifies the position in the
02476  *  string to begin the search.
02477  *
02478  *     "hello".index('e')             #=> 1
02479  *     "hello".index('lo')            #=> 3
02480  *     "hello".index('a')             #=> nil
02481  *     "hello".index(?e)              #=> 1
02482  *     "hello".index(/[aeiou]/, -3)   #=> 4
02483  */
02484 
02485 static VALUE
02486 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02487 {
02488     VALUE sub;
02489     VALUE initpos;
02490     long pos;
02491 
02492     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02493         pos = NUM2LONG(initpos);
02494     }
02495     else {
02496         pos = 0;
02497     }
02498     if (pos < 0) {
02499         pos += str_strlen(str, STR_ENC_GET(str));
02500         if (pos < 0) {
02501             if (TYPE(sub) == T_REGEXP) {
02502                 rb_backref_set(Qnil);
02503             }
02504             return Qnil;
02505         }
02506     }
02507 
02508     switch (TYPE(sub)) {
02509       case T_REGEXP:
02510         if (pos > str_strlen(str, STR_ENC_GET(str)))
02511             return Qnil;
02512         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02513                          rb_enc_check(str, sub), single_byte_optimizable(str));
02514 
02515         pos = rb_reg_search(sub, str, pos, 0);
02516         pos = rb_str_sublen(str, pos);
02517         break;
02518 
02519       default: {
02520         VALUE tmp;
02521 
02522         tmp = rb_check_string_type(sub);
02523         if (NIL_P(tmp)) {
02524             rb_raise(rb_eTypeError, "type mismatch: %s given",
02525                      rb_obj_classname(sub));
02526         }
02527         sub = tmp;
02528       }
02529         /* fall through */
02530       case T_STRING:
02531         pos = rb_str_index(str, sub, pos);
02532         pos = rb_str_sublen(str, pos);
02533         break;
02534     }
02535 
02536     if (pos == -1) return Qnil;
02537     return LONG2NUM(pos);
02538 }
02539 
02540 static long
02541 rb_str_rindex(VALUE str, VALUE sub, long pos)
02542 {
02543     long len, slen;
02544     char *s, *sbeg, *e, *t;
02545     rb_encoding *enc;
02546     int singlebyte = single_byte_optimizable(str);
02547 
02548     enc = rb_enc_check(str, sub);
02549     if (is_broken_string(sub)) {
02550         return -1;
02551     }
02552     len = str_strlen(str, enc);
02553     slen = str_strlen(sub, enc);
02554     /* substring longer than string */
02555     if (len < slen) return -1;
02556     if (len - pos < slen) {
02557         pos = len - slen;
02558     }
02559     if (len == 0) {
02560         return pos;
02561     }
02562     sbeg = RSTRING_PTR(str);
02563     e = RSTRING_END(str);
02564     t = RSTRING_PTR(sub);
02565     slen = RSTRING_LEN(sub);
02566     s = str_nth(sbeg, e, pos, enc, singlebyte);
02567     while (s) {
02568         if (memcmp(s, t, slen) == 0) {
02569             return pos;
02570         }
02571         if (pos == 0) break;
02572         pos--;
02573         s = rb_enc_prev_char(sbeg, s, e, enc);
02574     }
02575     return -1;
02576 }
02577 
02578 
02579 /*
02580  *  call-seq:
02581  *     str.rindex(substring [, fixnum])   -> fixnum or nil
02582  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
02583  *
02584  *  Returns the index of the last occurrence of the given <i>substring</i> or
02585  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02586  *  found. If the second parameter is present, it specifies the position in the
02587  *  string to end the search---characters beyond this point will not be
02588  *  considered.
02589  *
02590  *     "hello".rindex('e')             #=> 1
02591  *     "hello".rindex('l')             #=> 3
02592  *     "hello".rindex('a')             #=> nil
02593  *     "hello".rindex(?e)              #=> 1
02594  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
02595  */
02596 
02597 static VALUE
02598 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02599 {
02600     VALUE sub;
02601     VALUE vpos;
02602     rb_encoding *enc = STR_ENC_GET(str);
02603     long pos, len = str_strlen(str, enc);
02604 
02605     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02606         pos = NUM2LONG(vpos);
02607         if (pos < 0) {
02608             pos += len;
02609             if (pos < 0) {
02610                 if (TYPE(sub) == T_REGEXP) {
02611                     rb_backref_set(Qnil);
02612                 }
02613                 return Qnil;
02614             }
02615         }
02616         if (pos > len) pos = len;
02617     }
02618     else {
02619         pos = len;
02620     }
02621 
02622     switch (TYPE(sub)) {
02623       case T_REGEXP:
02624         /* enc = rb_get_check(str, sub); */
02625         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02626                          STR_ENC_GET(str), single_byte_optimizable(str));
02627 
02628         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02629             pos = rb_reg_search(sub, str, pos, 1);
02630             pos = rb_str_sublen(str, pos);
02631         }
02632         if (pos >= 0) return LONG2NUM(pos);
02633         break;
02634 
02635       default: {
02636         VALUE tmp;
02637 
02638         tmp = rb_check_string_type(sub);
02639         if (NIL_P(tmp)) {
02640             rb_raise(rb_eTypeError, "type mismatch: %s given",
02641                      rb_obj_classname(sub));
02642         }
02643         sub = tmp;
02644       }
02645         /* fall through */
02646       case T_STRING:
02647         pos = rb_str_rindex(str, sub, pos);
02648         if (pos >= 0) return LONG2NUM(pos);
02649         break;
02650     }
02651     return Qnil;
02652 }
02653 
02654 /*
02655  *  call-seq:
02656  *     str =~ obj   -> fixnum or nil
02657  *
02658  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
02659  *  against <i>str</i>, and returns the position where the match starts, or
02660  *  <code>nil</code> if there is no match. Otherwise, invokes
02661  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
02662  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
02663  *
02664  *     "cat o' 9 tails" =~ /\d/   #=> 7
02665  *     "cat o' 9 tails" =~ 9      #=> nil
02666  */
02667 
02668 static VALUE
02669 rb_str_match(VALUE x, VALUE y)
02670 {
02671     switch (TYPE(y)) {
02672       case T_STRING:
02673         rb_raise(rb_eTypeError, "type mismatch: String given");
02674 
02675       case T_REGEXP:
02676         return rb_reg_match(y, x);
02677 
02678       default:
02679         return rb_funcall(y, rb_intern("=~"), 1, x);
02680     }
02681 }
02682 
02683 
02684 static VALUE get_pat(VALUE, int);
02685 
02686 
02687 /*
02688  *  call-seq:
02689  *     str.match(pattern)        -> matchdata or nil
02690  *     str.match(pattern, pos)   -> matchdata or nil
02691  *
02692  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
02693  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
02694  *  parameter is present, it specifies the position in the string to begin the
02695  *  search.
02696  *
02697  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
02698  *     'hello'.match('(.)\1')[0]   #=> "ll"
02699  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
02700  *     'hello'.match('xx')         #=> nil
02701  *
02702  *  If a block is given, invokes the block with the MatchData if the match succeeds, so
02703  *  that you can write
02704  *
02705  *     str.match(pat) {|m| ...}
02706  *
02707  *  instead of
02708  *
02709  *     if m = str.match(pat)
02710  *       ...
02711  *     end
02712  *
02713  *  The return value is a value from block execution in this case.
02714  */
02715 
02716 static VALUE
02717 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02718 {
02719     VALUE re, result;
02720     if (argc < 1)
02721        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02722     re = argv[0];
02723     argv[0] = str;
02724     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02725     if (!NIL_P(result) && rb_block_given_p()) {
02726         return rb_yield(result);
02727     }
02728     return result;
02729 }
02730 
/* Result of stepping a single character to its neighbor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbor for this character */
    NEIGHBOR_FOUND    = 1,  /* the character was replaced by its neighbor */
    NEIGHBOR_WRAPPED  = 2   /* stepping wrapped around */
};
02736 
02737 static enum neighbor_char
02738 enc_succ_char(char *p, long len, rb_encoding *enc)
02739 {
02740     long i;
02741     int l;
02742     while (1) {
02743         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02744             p[i] = '\0';
02745         if (i < 0)
02746             return NEIGHBOR_WRAPPED;
02747         ++((unsigned char*)p)[i];
02748         l = rb_enc_precise_mbclen(p, p+len, enc);
02749         if (MBCLEN_CHARFOUND_P(l)) {
02750             l = MBCLEN_CHARFOUND_LEN(l);
02751             if (l == len) {
02752                 return NEIGHBOR_FOUND;
02753             }
02754             else {
02755                 memset(p+l, 0xff, len-l);
02756             }
02757         }
02758         if (MBCLEN_INVALID_P(l) && i < len-1) {
02759             long len2;
02760             int l2;
02761             for (len2 = len-1; 0 < len2; len2--) {
02762                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02763                 if (!MBCLEN_INVALID_P(l2))
02764                     break;
02765             }
02766             memset(p+len2+1, 0xff, len-(len2+1));
02767         }
02768     }
02769 }
02770 
02771 static enum neighbor_char
02772 enc_pred_char(char *p, long len, rb_encoding *enc)
02773 {
02774     long i;
02775     int l;
02776     while (1) {
02777         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02778             p[i] = '\xff';
02779         if (i < 0)
02780             return NEIGHBOR_WRAPPED;
02781         --((unsigned char*)p)[i];
02782         l = rb_enc_precise_mbclen(p, p+len, enc);
02783         if (MBCLEN_CHARFOUND_P(l)) {
02784             l = MBCLEN_CHARFOUND_LEN(l);
02785             if (l == len) {
02786                 return NEIGHBOR_FOUND;
02787             }
02788             else {
02789                 memset(p+l, 0, len-l);
02790             }
02791         }
02792         if (MBCLEN_INVALID_P(l) && i < len-1) {
02793             long len2;
02794             int l2;
02795             for (len2 = len-1; 0 < len2; len2--) {
02796                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02797                 if (!MBCLEN_INVALID_P(l2))
02798                     break;
02799             }
02800             memset(p+len2+1, 0, len-(len2+1));
02801         }
02802     }
02803 }
02804 
02805 /*
02806   overwrite +p+ by succeeding letter in +enc+ and returns
02807   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
02808   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
02809   assuming each ranges are successive, and mbclen
02810   never change in each ranges.
02811   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
02812   character.
02813  */
02814 static enum neighbor_char
02815 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02816 {
02817     enum neighbor_char ret;
02818     unsigned int c;
02819     int ctype;
02820     int range;
02821     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02822 
02823     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02824     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02825         ctype = ONIGENC_CTYPE_DIGIT;
02826     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02827         ctype = ONIGENC_CTYPE_ALPHA;
02828     else
02829         return NEIGHBOR_NOT_CHAR;
02830 
02831     MEMCPY(save, p, char, len);
02832     ret = enc_succ_char(p, len, enc);
02833     if (ret == NEIGHBOR_FOUND) {
02834         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02835         if (rb_enc_isctype(c, ctype, enc))
02836             return NEIGHBOR_FOUND;
02837     }
02838     MEMCPY(p, save, char, len);
02839     range = 1;
02840     while (1) {
02841         MEMCPY(save, p, char, len);
02842         ret = enc_pred_char(p, len, enc);
02843         if (ret == NEIGHBOR_FOUND) {
02844             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02845             if (!rb_enc_isctype(c, ctype, enc)) {
02846                 MEMCPY(p, save, char, len);
02847                 break;
02848             }
02849         }
02850         else {
02851             MEMCPY(p, save, char, len);
02852             break;
02853         }
02854         range++;
02855     }
02856     if (range == 1) {
02857         return NEIGHBOR_NOT_CHAR;
02858     }
02859 
02860     if (ctype != ONIGENC_CTYPE_DIGIT) {
02861         MEMCPY(carry, p, char, len);
02862         return NEIGHBOR_WRAPPED;
02863     }
02864 
02865     MEMCPY(carry, p, char, len);
02866     enc_succ_char(carry, len, enc);
02867     return NEIGHBOR_WRAPPED;
02868 }
02869 
02870 
02871 /*
02872  *  call-seq:
02873  *     str.succ   -> new_str
02874  *     str.next   -> new_str
02875  *
02876  *  Returns the successor to <i>str</i>. The successor is calculated by
02877  *  incrementing characters starting from the rightmost alphanumeric (or
02878  *  the rightmost character if there are no alphanumerics) in the
02879  *  string. Incrementing a digit always results in another digit, and
02880  *  incrementing a letter results in another letter of the same case.
02881  *  Incrementing nonalphanumerics uses the underlying character set's
02882  *  collating sequence.
02883  *
02884  *  If the increment generates a ``carry,'' the character to the left of
02885  *  it is incremented. This process repeats until there is no carry,
02886  *  adding an additional character if necessary.
02887  *
02888  *     "abcd".succ        #=> "abce"
02889  *     "THX1138".succ     #=> "THX1139"
02890  *     "<<koala>>".succ   #=> "<<koalb>>"
02891  *     "1999zzz".succ     #=> "2000aaa"
02892  *     "ZZZ9999".succ     #=> "AAAA0000"
02893  *     "***".succ         #=> "**+"
02894  */
02895 
02896 VALUE
02897 rb_str_succ(VALUE orig)
02898 {
02899     rb_encoding *enc;
02900     VALUE str;
02901     char *sbeg, *s, *e, *last_alnum = 0;
02902     int c = -1;
02903     long l;
02904     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02905     long carry_pos = 0, carry_len = 1;
02906     enum neighbor_char neighbor = NEIGHBOR_FOUND;
02907 
02908     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02909     rb_enc_cr_str_copy_for_substr(str, orig);
02910     OBJ_INFECT(str, orig);
02911     if (RSTRING_LEN(str) == 0) return str;
02912 
02913     enc = STR_ENC_GET(orig);
02914     sbeg = RSTRING_PTR(str);
02915     s = e = sbeg + RSTRING_LEN(str);
02916 
02917     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02918         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02919             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02920                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02921                 s = last_alnum;
02922                 break;
02923             }
02924         }
02925         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02926         neighbor = enc_succ_alnum_char(s, l, enc, carry);
02927         switch (neighbor) {
02928           case NEIGHBOR_NOT_CHAR:
02929             continue;
02930           case NEIGHBOR_FOUND:
02931             return str;
02932           case NEIGHBOR_WRAPPED:
02933             last_alnum = s;
02934             break;
02935         }
02936         c = 1;
02937         carry_pos = s - sbeg;
02938         carry_len = l;
02939     }
02940     if (c == -1) {              /* str contains no alnum */
02941         s = e;
02942         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02943             enum neighbor_char neighbor;
02944             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02945             neighbor = enc_succ_char(s, l, enc);
02946             if (neighbor == NEIGHBOR_FOUND)
02947                 return str;
02948             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02949                 /* wrapped to \0...\0.  search next valid char. */
02950                 enc_succ_char(s, l, enc);
02951             }
02952             if (!rb_enc_asciicompat(enc)) {
02953                 MEMCPY(carry, s, char, l);
02954                 carry_len = l;
02955             }
02956             carry_pos = s - sbeg;
02957         }
02958     }
02959     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02960     s = RSTRING_PTR(str) + carry_pos;
02961     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02962     memmove(s, carry, carry_len);
02963     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02964     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02965     rb_enc_str_coderange(str);
02966     return str;
02967 }
02968 
02969 
02970 /*
02971  *  call-seq:
02972  *     str.succ!   -> str
02973  *     str.next!   -> str
02974  *
02975  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
02976  *  place.
02977  */
02978 
02979 static VALUE
02980 rb_str_succ_bang(VALUE str)
02981 {
02982     rb_str_shared_replace(str, rb_str_succ(str));
02983 
02984     return str;
02985 }
02986 
02987 
02988 /*
02989  *  call-seq:
02990  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
02991  *     str.upto(other_str, exclusive=false)                -> an_enumerator
02992  *
02993  *  Iterates through successive values, starting at <i>str</i> and
02994  *  ending at <i>other_str</i> inclusive, passing each value in turn to
02995  *  the block. The <code>String#succ</code> method is used to generate
02996  *  each value.  If optional second argument exclusive is omitted or is false,
02997  *  the last value will be included; otherwise it will be excluded.
02998  *
02999  *  If no block is given, an enumerator is returned instead.
03000  *
03001  *     "a8".upto("b6") {|s| print s, ' ' }
03002  *     for s in "a8".."b6"
03003  *       print s, ' '
03004  *     end
03005  *
03006  *  <em>produces:</em>
03007  *
03008  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03009  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03010  *
03011  *  If <i>str</i> and <i>other_str</i> contain only ascii numeric characters,
03012  *  both are recognized as decimal numbers. In addition, the width of
03013  *  string (e.g. leading zeros) is handled appropriately.
03014  *
03015  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
03016  *     "25".upto("5").to_a   #=> []
03017  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
03018  */
03019 
03020 static VALUE
03021 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03022 {
03023     VALUE end, exclusive;
03024     VALUE current, after_end;
03025     ID succ;
03026     int n, excl, ascii;
03027     rb_encoding *enc;
03028 
03029     rb_scan_args(argc, argv, "11", &end, &exclusive);
03030     RETURN_ENUMERATOR(beg, argc, argv);
03031     excl = RTEST(exclusive);
03032     CONST_ID(succ, "succ");
03033     StringValue(end);
03034     enc = rb_enc_check(beg, end);
03035     ascii = (is_ascii_string(beg) && is_ascii_string(end));
03036     /* single character */
03037     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03038         char c = RSTRING_PTR(beg)[0];
03039         char e = RSTRING_PTR(end)[0];
03040 
03041         if (c > e || (excl && c == e)) return beg;
03042         for (;;) {
03043             rb_yield(rb_enc_str_new(&c, 1, enc));
03044             if (!excl && c == e) break;
03045             c++;
03046             if (excl && c == e) break;
03047         }
03048         return beg;
03049     }
03050     /* both edges are all digits */
03051     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03052         char *s, *send;
03053         VALUE b, e;
03054         int width;
03055 
03056         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
03057         width = rb_long2int(send - s);
03058         while (s < send) {
03059             if (!ISDIGIT(*s)) goto no_digits;
03060             s++;
03061         }
03062         s = RSTRING_PTR(end); send = RSTRING_END(end);
03063         while (s < send) {
03064             if (!ISDIGIT(*s)) goto no_digits;
03065             s++;
03066         }
03067         b = rb_str_to_inum(beg, 10, FALSE);
03068         e = rb_str_to_inum(end, 10, FALSE);
03069         if (FIXNUM_P(b) && FIXNUM_P(e)) {
03070             long bi = FIX2LONG(b);
03071             long ei = FIX2LONG(e);
03072             rb_encoding *usascii = rb_usascii_encoding();
03073 
03074             while (bi <= ei) {
03075                 if (excl && bi == ei) break;
03076                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03077                 bi++;
03078             }
03079         }
03080         else {
03081             ID op = excl ? '<' : rb_intern("<=");
03082             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03083 
03084             args[0] = INT2FIX(width);
03085             while (rb_funcall(b, op, 1, e)) {
03086                 args[1] = b;
03087                 rb_yield(rb_str_format(numberof(args), args, fmt));
03088                 b = rb_funcall(b, succ, 0, 0);
03089             }
03090         }
03091         return beg;
03092     }
03093     /* normal case */
03094   no_digits:
03095     n = rb_str_cmp(beg, end);
03096     if (n > 0 || (excl && n == 0)) return beg;
03097 
03098     after_end = rb_funcall(end, succ, 0, 0);
03099     current = rb_str_dup(beg);
03100     while (!rb_str_equal(current, after_end)) {
03101         VALUE next = Qnil;
03102         if (excl || !rb_str_equal(current, end))
03103             next = rb_funcall(current, succ, 0, 0);
03104         rb_yield(current);
03105         if (NIL_P(next)) break;
03106         current = next;
03107         StringValue(current);
03108         if (excl && rb_str_equal(current, end)) break;
03109         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03110             break;
03111     }
03112 
03113     return beg;
03114 }
03115 
03116 static VALUE
03117 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03118 {
03119     if (rb_reg_search(re, str, 0, 0) >= 0) {
03120         VALUE match = rb_backref_get();
03121         int nth = rb_reg_backref_number(match, backref);
03122         return rb_reg_nth_match(nth, match);
03123     }
03124     return Qnil;
03125 }
03126 
03127 static VALUE
03128 rb_str_aref(VALUE str, VALUE indx)
03129 {
03130     long idx;
03131 
03132     switch (TYPE(indx)) {
03133       case T_FIXNUM:
03134         idx = FIX2LONG(indx);
03135 
03136       num_index:
03137         str = rb_str_substr(str, idx, 1);
03138         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03139         return str;
03140 
03141       case T_REGEXP:
03142         return rb_str_subpat(str, indx, INT2FIX(0));
03143 
03144       case T_STRING:
03145         if (rb_str_index(str, indx, 0) != -1)
03146             return rb_str_dup(indx);
03147         return Qnil;
03148 
03149       default:
03150         /* check if indx is Range */
03151         {
03152             long beg, len;
03153             VALUE tmp;
03154 
03155             len = str_strlen(str, STR_ENC_GET(str));
03156             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03157               case Qfalse:
03158                 break;
03159               case Qnil:
03160                 return Qnil;
03161               default:
03162                 tmp = rb_str_substr(str, beg, len);
03163                 return tmp;
03164             }
03165         }
03166         idx = NUM2LONG(indx);
03167         goto num_index;
03168     }
03169     return Qnil;                /* not reached */
03170 }
03171 
03172 
03173 /*
03174  *  call-seq:
03175  *     str[fixnum]                 -> new_str or nil
03176  *     str[fixnum, fixnum]         -> new_str or nil
03177  *     str[range]                  -> new_str or nil
03178  *     str[regexp]                 -> new_str or nil
03179  *     str[regexp, fixnum]         -> new_str or nil
03180  *     str[other_str]              -> new_str or nil
03181  *     str.slice(fixnum)           -> new_str or nil
03182  *     str.slice(fixnum, fixnum)   -> new_str or nil
03183  *     str.slice(range)            -> new_str or nil
03184  *     str.slice(regexp)           -> new_str or nil
03185  *     str.slice(regexp, fixnum)   -> new_str or nil
03186  *     str.slice(regexp, capname)  -> new_str or nil
03187  *     str.slice(other_str)        -> new_str or nil
03188  *
03189  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
03190  *  substring of one character at that position. If passed two <code>Fixnum</code>
03191  *  objects, returns a substring starting at the offset given by the first, and
03192  *  with a length given by the second. If passed a range, its beginning and end
03193  *  are interpreted as offsets delimiting the substring to be returned. In all
03194  *  three cases, if an offset is negative, it is counted from the end of <i>str</i>.
03195  *  Returns <code>nil</code> if the initial offset falls outside the string or
03196  *  the length is negative.
03197  *
03198  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
03199  *  returned. If a numeric or name parameter follows the regular expression, that
03200  *  component of the <code>MatchData</code> is returned instead. If a
03201  *  <code>String</code> is given, that string is returned if it occurs in
03202  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
03203  *  match.
03204  *
03205  *     a = "hello there"
03206  *     a[1]                   #=> "e"
03207  *     a[2, 3]                #=> "llo"
03208  *     a[2..3]                #=> "ll"
03209  *     a[-3, 2]               #=> "er"
03210  *     a[7..-2]               #=> "her"
03211  *     a[-4..-2]              #=> "her"
03212  *     a[-2..-4]              #=> ""
03213  *     a[12..-1]              #=> nil
03214  *     a[/[aeiou](.)\1/]      #=> "ell"
03215  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
03216  *     a[/[aeiou](.)\1/, 1]   #=> "l"
03217  *     a[/[aeiou](.)\1/, 2]   #=> nil
03218  *     a["lo"]                #=> "lo"
03219  *     a["bye"]               #=> nil
03220  */
03221 
03222 static VALUE
03223 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03224 {
03225     if (argc == 2) {
03226         if (TYPE(argv[0]) == T_REGEXP) {
03227             return rb_str_subpat(str, argv[0], argv[1]);
03228         }
03229         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03230     }
03231     if (argc != 1) {
03232         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03233     }
03234     return rb_str_aref(str, argv[0]);
03235 }
03236 
03237 VALUE
03238 rb_str_drop_bytes(VALUE str, long len)
03239 {
03240     char *ptr = RSTRING_PTR(str);
03241     long olen = RSTRING_LEN(str), nlen;
03242 
03243     str_modifiable(str);
03244     if (len > olen) len = olen;
03245     nlen = olen - len;
03246     if (nlen <= RSTRING_EMBED_LEN_MAX) {
03247         char *oldptr = ptr;
03248         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03249         STR_SET_EMBED(str);
03250         STR_SET_EMBED_LEN(str, nlen);
03251         ptr = RSTRING(str)->as.ary;
03252         memmove(ptr, oldptr + len, nlen);
03253         if (fl == STR_NOEMBED) xfree(oldptr);
03254     }
03255     else {
03256         if (!STR_SHARED_P(str)) rb_str_new4(str);
03257         ptr = RSTRING(str)->as.heap.ptr += len;
03258         RSTRING(str)->as.heap.len = nlen;
03259     }
03260     ptr[nlen] = 0;
03261     ENC_CODERANGE_CLEAR(str);
03262     return str;
03263 }
03264 
03265 static void
03266 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03267 {
03268     if (beg == 0 && RSTRING_LEN(val) == 0) {
03269         rb_str_drop_bytes(str, len);
03270         OBJ_INFECT(str, val);
03271         return;
03272     }
03273 
03274     rb_str_modify(str);
03275     if (len < RSTRING_LEN(val)) {
03276         /* expand string */
03277         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03278     }
03279 
03280     if (RSTRING_LEN(val) != len) {
03281         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03282                 RSTRING_PTR(str) + beg + len,
03283                 RSTRING_LEN(str) - (beg + len));
03284     }
03285     if (RSTRING_LEN(val) < beg && len < 0) {
03286         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03287     }
03288     if (RSTRING_LEN(val) > 0) {
03289         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03290     }
03291     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03292     if (RSTRING_PTR(str)) {
03293         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03294     }
03295     OBJ_INFECT(str, val);
03296 }
03297 
03298 static void
03299 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03300 {
03301     long slen;
03302     char *p, *e;
03303     rb_encoding *enc;
03304     int singlebyte = single_byte_optimizable(str);
03305     int cr;
03306 
03307     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03308 
03309     StringValue(val);
03310     enc = rb_enc_check(str, val);
03311     slen = str_strlen(str, enc);
03312 
03313     if (slen < beg) {
03314       out_of_range:
03315         rb_raise(rb_eIndexError, "index %ld out of string", beg);
03316     }
03317     if (beg < 0) {
03318         if (-beg > slen) {
03319             goto out_of_range;
03320         }
03321         beg += slen;
03322     }
03323     if (slen < len || slen < beg + len) {
03324         len = slen - beg;
03325     }
03326     str_modify_keep_cr(str);
03327     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03328     if (!p) p = RSTRING_END(str);
03329     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03330     if (!e) e = RSTRING_END(str);
03331     /* error check */
03332     beg = p - RSTRING_PTR(str); /* physical position */
03333     len = e - p;                /* physical length */
03334     rb_str_splice_0(str, beg, len, val);
03335     rb_enc_associate(str, enc);
03336     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03337     if (cr != ENC_CODERANGE_BROKEN)
03338         ENC_CODERANGE_SET(str, cr);
03339 }
03340 
03341 void
03342 rb_str_update(VALUE str, long beg, long len, VALUE val)
03343 {
03344     rb_str_splice(str, beg, len, val);
03345 }
03346 
03347 static void
03348 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03349 {
03350     int nth;
03351     VALUE match;
03352     long start, end, len;
03353     rb_encoding *enc;
03354     struct re_registers *regs;
03355 
03356     if (rb_reg_search(re, str, 0, 0) < 0) {
03357         rb_raise(rb_eIndexError, "regexp not matched");
03358     }
03359     match = rb_backref_get();
03360     nth = rb_reg_backref_number(match, backref);
03361     regs = RMATCH_REGS(match);
03362     if (nth >= regs->num_regs) {
03363       out_of_range:
03364         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03365     }
03366     if (nth < 0) {
03367         if (-nth >= regs->num_regs) {
03368             goto out_of_range;
03369         }
03370         nth += regs->num_regs;
03371     }
03372 
03373     start = BEG(nth);
03374     if (start == -1) {
03375         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03376     }
03377     end = END(nth);
03378     len = end - start;
03379     StringValue(val);
03380     enc = rb_enc_check(str, val);
03381     rb_str_splice_0(str, start, len, val);
03382     rb_enc_associate(str, enc);
03383 }
03384 
03385 static VALUE
03386 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03387 {
03388     long idx, beg;
03389 
03390     switch (TYPE(indx)) {
03391       case T_FIXNUM:
03392         idx = FIX2LONG(indx);
03393       num_index:
03394         rb_str_splice(str, idx, 1, val);
03395         return val;
03396 
03397       case T_REGEXP:
03398         rb_str_subpat_set(str, indx, INT2FIX(0), val);
03399         return val;
03400 
03401       case T_STRING:
03402         beg = rb_str_index(str, indx, 0);
03403         if (beg < 0) {
03404             rb_raise(rb_eIndexError, "string not matched");
03405         }
03406         beg = rb_str_sublen(str, beg);
03407         rb_str_splice(str, beg, str_strlen(indx, 0), val);
03408         return val;
03409 
03410       default:
03411         /* check if indx is Range */
03412         {
03413             long beg, len;
03414             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03415                 rb_str_splice(str, beg, len, val);
03416                 return val;
03417             }
03418         }
03419         idx = NUM2LONG(indx);
03420         goto num_index;
03421     }
03422 }
03423 
03424 /*
03425  *  call-seq:
03426  *     str[fixnum] = new_str
03427  *     str[fixnum, fixnum] = new_str
03428  *     str[range] = aString
03429  *     str[regexp] = new_str
03430  *     str[regexp, fixnum] = new_str
03431  *     str[regexp, name] = new_str
03432  *     str[other_str] = new_str
03433  *
03434  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
03435  *  portion of the string affected is determined using the same criteria as
03436  *  <code>String#[]</code>. If the replacement string is not the same length as
03437  *  the text it is replacing, the string will be adjusted accordingly. If the
03438  *  regular expression or string used as the index doesn't match a position
03439  *  in the string, <code>IndexError</code> is raised. If the regular expression
03440  *  form is used, the optional second <code>Fixnum</code> allows you to specify
03441  *  which portion of the match to replace (effectively using the
03442  *  <code>MatchData</code> indexing rules). The forms that take a
03443  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
03444  *  out of range; the <code>Range</code> form will raise a
03445  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
03446  *  forms will raise an <code>IndexError</code> when there is no match.
03447  */
03448 
03449 static VALUE
03450 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03451 {
03452     if (argc == 3) {
03453         if (TYPE(argv[0]) == T_REGEXP) {
03454             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03455         }
03456         else {
03457             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03458         }
03459         return argv[2];
03460     }
03461     if (argc != 2) {
03462         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03463     }
03464     return rb_str_aset(str, argv[0], argv[1]);
03465 }
03466 
03467 /*
03468  *  call-seq:
03469  *     str.insert(index, other_str)   -> str
03470  *
03471  *  Inserts <i>other_str</i> before the character at the given
03472  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
03473  *  end of the string, and insert <em>after</em> the given character.
03474  *  The intent is to insert <i>other_str</i> so that it starts at the given
03475  *  <i>index</i>.
03476  *
03477  *     "abcd".insert(0, 'X')    #=> "Xabcd"
03478  *     "abcd".insert(3, 'X')    #=> "abcXd"
03479  *     "abcd".insert(4, 'X')    #=> "abcdX"
03480  *     "abcd".insert(-3, 'X')   #=> "abXcd"
03481  *     "abcd".insert(-1, 'X')   #=> "abcdX"
03482  */
03483 
03484 static VALUE
03485 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03486 {
03487     long pos = NUM2LONG(idx);
03488 
03489     if (pos == -1) {
03490         return rb_str_append(str, str2);
03491     }
03492     else if (pos < 0) {
03493         pos++;
03494     }
03495     rb_str_splice(str, pos, 0, str2);
03496     return str;
03497 }
03498 
03499 
03500 /*
03501  *  call-seq:
03502  *     str.slice!(fixnum)           -> new_str or nil
03503  *     str.slice!(fixnum, fixnum)   -> new_str or nil
03504  *     str.slice!(range)            -> new_str or nil
03505  *     str.slice!(regexp)           -> new_str or nil
03506  *     str.slice!(other_str)        -> new_str or nil
03507  *
03508  *  Deletes the specified portion from <i>str</i>, and returns the portion
03509  *  deleted.
03510  *
03511  *     string = "this is a string"
03512  *     string.slice!(2)        #=> "i"
03513  *     string.slice!(3..6)     #=> " is "
03514  *     string.slice!(/s.*t/)   #=> "sa st"
03515  *     string.slice!("r")      #=> "r"
03516  *     string                  #=> "thing"
03517  */
03518 
03519 static VALUE
03520 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03521 {
03522     VALUE result;
03523     VALUE buf[3];
03524     int i;
03525 
03526     if (argc < 1 || 2 < argc) {
03527         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03528     }
03529     for (i=0; i<argc; i++) {
03530         buf[i] = argv[i];
03531     }
03532     str_modify_keep_cr(str);
03533     result = rb_str_aref_m(argc, buf, str);
03534     if (!NIL_P(result)) {
03535         buf[i] = rb_str_new(0,0);
03536         rb_str_aset_m(argc+1, buf, str);
03537     }
03538     return result;
03539 }
03540 
03541 static VALUE
03542 get_pat(VALUE pat, int quote)
03543 {
03544     VALUE val;
03545 
03546     switch (TYPE(pat)) {
03547       case T_REGEXP:
03548         return pat;
03549 
03550       case T_STRING:
03551         break;
03552 
03553       default:
03554         val = rb_check_string_type(pat);
03555         if (NIL_P(val)) {
03556             Check_Type(pat, T_REGEXP);
03557         }
03558         pat = val;
03559     }
03560 
03561     if (quote) {
03562         pat = rb_reg_quote(pat);
03563     }
03564 
03565     return rb_reg_regcomp(pat);
03566 }
03567 
03568 
03569 /*
03570  *  call-seq:
03571  *     str.sub!(pattern, replacement)          -> str or nil
03572  *     str.sub!(pattern) {|match| block }      -> str or nil
03573  *
03574  *  Performs the substitutions of <code>String#sub</code> in place,
03575  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
03576  *  performed.
03577  */
03578 
03579 static VALUE
03580 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03581 {
03582     VALUE pat, repl, hash = Qnil;
03583     int iter = 0;
03584     int tainted = 0;
03585     int untrusted = 0;
03586     long plen;
03587 
03588     if (argc == 1 && rb_block_given_p()) {
03589         iter = 1;
03590     }
03591     else if (argc == 2) {
03592         repl = argv[1];
03593         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03594         if (NIL_P(hash)) {
03595             StringValue(repl);
03596         }
03597         if (OBJ_TAINTED(repl)) tainted = 1;
03598         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03599     }
03600     else {
03601         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03602     }
03603 
03604     pat = get_pat(argv[0], 1);
03605     str_modifiable(str);
03606     if (rb_reg_search(pat, str, 0, 0) >= 0) {
03607         rb_encoding *enc;
03608         int cr = ENC_CODERANGE(str);
03609         VALUE match = rb_backref_get();
03610         struct re_registers *regs = RMATCH_REGS(match);
03611         long beg0 = BEG(0);
03612         long end0 = END(0);
03613         char *p, *rp;
03614         long len, rlen;
03615 
03616         if (iter || !NIL_P(hash)) {
03617             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03618 
03619             if (iter) {
03620                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03621             }
03622             else {
03623                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03624                 repl = rb_obj_as_string(repl);
03625             }
03626             str_mod_check(str, p, len);
03627             rb_check_frozen(str);
03628         }
03629         else {
03630             repl = rb_reg_regsub(repl, str, regs, pat);
03631         }
03632         enc = rb_enc_compatible(str, repl);
03633         if (!enc) {
03634             rb_encoding *str_enc = STR_ENC_GET(str);
03635             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03636             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03637                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03638                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03639                          rb_enc_name(str_enc),
03640                          rb_enc_name(STR_ENC_GET(repl)));
03641             }
03642             enc = STR_ENC_GET(repl);
03643         }
03644         rb_str_modify(str);
03645         rb_enc_associate(str, enc);
03646         if (OBJ_TAINTED(repl)) tainted = 1;
03647         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03648         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03649             int cr2 = ENC_CODERANGE(repl);
03650             if (cr2 == ENC_CODERANGE_BROKEN ||
03651                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03652                 cr = ENC_CODERANGE_UNKNOWN;
03653             else
03654                 cr = cr2;
03655         }
03656         plen = end0 - beg0;
03657         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03658         len = RSTRING_LEN(str);
03659         if (rlen > plen) {
03660             RESIZE_CAPA(str, len + rlen - plen);
03661         }
03662         p = RSTRING_PTR(str);
03663         if (rlen != plen) {
03664             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03665         }
03666         memcpy(p + beg0, rp, rlen);
03667         len += rlen - plen;
03668         STR_SET_LEN(str, len);
03669         RSTRING_PTR(str)[len] = '\0';
03670         ENC_CODERANGE_SET(str, cr);
03671         if (tainted) OBJ_TAINT(str);
03672         if (untrusted) OBJ_UNTRUST(str);
03673 
03674         return str;
03675     }
03676     return Qnil;
03677 }
03678 
03679 
03680 /*
03681  *  call-seq:
03682  *     str.sub(pattern, replacement)         -> new_str
03683  *     str.sub(pattern, hash)                -> new_str
03684  *     str.sub(pattern) {|match| block }     -> new_str
03685  *
03686  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
03687  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
03688  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03689  *  regular expression metacharacters it contains will be interpreted
03690  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03691  *  instead of a digit.
03692  *
03693  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03694  *  the matched text. It may contain back-references to the pattern's capture
03695  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03696  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03697  *  double-quoted string, both back-references must be preceded by an
03698  *  additional backslash. However, within <i>replacement</i> the special match
03699  *  variables, such as <code>$&</code>, will not refer to the current match.
03700  *
03701  *  If the second argument is a <code>Hash</code>, and the matched text is one
03702  *  of its keys, the corresponding value is the replacement string.
03703  *
03704  *  In the block form, the current match string is passed in as a parameter,
03705  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03706  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03707  *  returned by the block will be substituted for the match on each call.
03708  *
03709  *  The result inherits any tainting in the original string or any supplied
03710  *  replacement string.
03711  *
03712  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
03713  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
03714  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
03715  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
03716  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
03717  *      #=> "Is /bin/bash your preferred shell?"
03718  */
03719 
03720 static VALUE
03721 rb_str_sub(int argc, VALUE *argv, VALUE str)
03722 {
03723     str = rb_str_dup(str);
03724     rb_str_sub_bang(argc, argv, str);
03725     return str;
03726 }
03727 
03728 static VALUE
03729 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03730 {
03731     VALUE pat, val, repl, match, dest, hash = Qnil;
03732     struct re_registers *regs;
03733     long beg, n;
03734     long beg0, end0;
03735     long offset, blen, slen, len, last;
03736     int iter = 0;
03737     char *sp, *cp;
03738     int tainted = 0;
03739     rb_encoding *str_enc;
03740 
03741     switch (argc) {
03742       case 1:
03743         RETURN_ENUMERATOR(str, argc, argv);
03744         iter = 1;
03745         break;
03746       case 2:
03747         repl = argv[1];
03748         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03749         if (NIL_P(hash)) {
03750             StringValue(repl);
03751         }
03752         if (OBJ_TAINTED(repl)) tainted = 1;
03753         break;
03754       default:
03755         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03756     }
03757 
03758     pat = get_pat(argv[0], 1);
03759     beg = rb_reg_search(pat, str, 0, 0);
03760     if (beg < 0) {
03761         if (bang) return Qnil;  /* no match, no substitution */
03762         return rb_str_dup(str);
03763     }
03764 
03765     offset = 0;
03766     n = 0;
03767     blen = RSTRING_LEN(str) + 30; /* len + margin */
03768     dest = rb_str_buf_new(blen);
03769     sp = RSTRING_PTR(str);
03770     slen = RSTRING_LEN(str);
03771     cp = sp;
03772     str_enc = STR_ENC_GET(str);
03773     rb_enc_associate(dest, str_enc);
03774     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
03775 
03776     do {
03777         n++;
03778         match = rb_backref_get();
03779         regs = RMATCH_REGS(match);
03780         beg0 = BEG(0);
03781         end0 = END(0);
03782         if (iter || !NIL_P(hash)) {
03783             if (iter) {
03784                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03785             }
03786             else {
03787                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03788                 val = rb_obj_as_string(val);
03789             }
03790             str_mod_check(str, sp, slen);
03791             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
03792                 rb_raise(rb_eRuntimeError, "block should not cheat");
03793             }
03794         }
03795         else {
03796             val = rb_reg_regsub(repl, str, regs, pat);
03797         }
03798 
03799         if (OBJ_TAINTED(val)) tainted = 1;
03800 
03801         len = beg - offset;     /* copy pre-match substr */
03802         if (len) {
03803             rb_enc_str_buf_cat(dest, cp, len, str_enc);
03804         }
03805 
03806         rb_str_buf_append(dest, val);
03807 
03808         last = offset;
03809         offset = end0;
03810         if (beg0 == end0) {
03811             /*
03812              * Always consume at least one character of the input string
03813              * in order to prevent infinite loops.
03814              */
03815             if (RSTRING_LEN(str) <= end0) break;
03816             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03817             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03818             offset = end0 + len;
03819         }
03820         cp = RSTRING_PTR(str) + offset;
03821         if (offset > RSTRING_LEN(str)) break;
03822         beg = rb_reg_search(pat, str, offset, 0);
03823     } while (beg >= 0);
03824     if (RSTRING_LEN(str) > offset) {
03825         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03826     }
03827     rb_reg_search(pat, str, last, 0);
03828     if (bang) {
03829         rb_str_shared_replace(str, dest);
03830     }
03831     else {
03832         RBASIC(dest)->klass = rb_obj_class(str);
03833         OBJ_INFECT(dest, str);
03834         str = dest;
03835     }
03836 
03837     if (tainted) OBJ_TAINT(str);
03838     return str;
03839 }
03840 
03841 
03842 /*
03843  *  call-seq:
03844  *     str.gsub!(pattern, replacement)        -> str or nil
03845  *     str.gsub!(pattern) {|match| block }    -> str or nil
03846  *     str.gsub!(pattern)                     -> an_enumerator
03847  *
03848  *  Performs the substitutions of <code>String#gsub</code> in place, returning
03849  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
03850  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
03851  */
03852 
03853 static VALUE
03854 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03855 {
03856     str_modify_keep_cr(str);
03857     return str_gsub(argc, argv, str, 1);
03858 }
03859 
03860 
03861 /*
03862  *  call-seq:
03863  *     str.gsub(pattern, replacement)       -> new_str
03864  *     str.gsub(pattern, hash)              -> new_str
03865  *     str.gsub(pattern) {|match| block }   -> new_str
03866  *     str.gsub(pattern)                    -> enumerator
03867  *
03868  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
03869  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
03870  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03871  *  regular expression metacharacters it contains will be interpreted
03872  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03873  *  instead of a digit.
03874  *
03875  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03876  *  the matched text. It may contain back-references to the pattern's capture
03877  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03878  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03879  *  double-quoted string, both back-references must be preceded by an
03880  *  additional backslash. However, within <i>replacement</i> the special match
03881  *  variables, such as <code>$&</code>, will not refer to the current match.
03882  *
03883  *  If the second argument is a <code>Hash</code>, and the matched text is one
03884  *  of its keys, the corresponding value is the replacement string.
03885  *
03886  *  In the block form, the current match string is passed in as a parameter,
03887  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03888  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03889  *  returned by the block will be substituted for the match on each call.
03890  *
03891  *  The result inherits any tainting in the original string or any supplied
03892  *  replacement string.
03893  *
03894  *  When neither a block nor a second argument is supplied, an
03895  *  <code>Enumerator</code> is returned.
03896  *
03897  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
03898  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
03899  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
03900  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
03901  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
03902  */
03903 
03904 static VALUE
03905 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03906 {
03907     return str_gsub(argc, argv, str, 0);
03908 }
03909 
03910 
03911 /*
03912  *  call-seq:
03913  *     str.replace(other_str)   -> str
03914  *
03915  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
03916  *  values in <i>other_str</i>.
03917  *
03918  *     s = "hello"         #=> "hello"
03919  *     s.replace "world"   #=> "world"
03920  */
03921 
03922 VALUE
03923 rb_str_replace(VALUE str, VALUE str2)
03924 {
03925     str_modifiable(str);
03926     if (str == str2) return str;
03927 
03928     StringValue(str2);
03929     str_discard(str);
03930     return str_replace(str, str2);
03931 }
03932 
03933 /*
03934  *  call-seq:
03935  *     string.clear    ->  string
03936  *
03937  *  Makes string empty.
03938  *
03939  *     a = "abcde"
03940  *     a.clear    #=> ""
03941  */
03942 
03943 static VALUE
03944 rb_str_clear(VALUE str)
03945 {
03946     str_discard(str);
03947     STR_SET_EMBED(str);
03948     STR_SET_EMBED_LEN(str, 0);
03949     RSTRING_PTR(str)[0] = 0;
03950     if (rb_enc_asciicompat(STR_ENC_GET(str)))
03951         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03952     else
03953         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03954     return str;
03955 }
03956 
03957 /*
03958  *  call-seq:
03959  *     string.chr    ->  string
03960  *
03961  *  Returns a one-character string at the beginning of the string.
03962  *
03963  *     a = "abcde"
03964  *     a.chr    #=> "a"
03965  */
03966 
03967 static VALUE
03968 rb_str_chr(VALUE str)
03969 {
03970     return rb_str_substr(str, 0, 1);
03971 }
03972 
03973 /*
03974  *  call-seq:
03975  *     str.getbyte(index)          -> 0 .. 255
03976  *
03977  *  returns the <i>index</i>th byte as an integer.
03978  */
03979 static VALUE
03980 rb_str_getbyte(VALUE str, VALUE index)
03981 {
03982     long pos = NUM2LONG(index);
03983 
03984     if (pos < 0)
03985         pos += RSTRING_LEN(str);
03986     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
03987         return Qnil;
03988 
03989     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03990 }
03991 
03992 /*
03993  *  call-seq:
03994  *     str.setbyte(index, int) -> int
03995  *
03996  *  modifies the <i>index</i>th byte as <i>int</i>.
03997  */
03998 static VALUE
03999 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
04000 {
04001     long pos = NUM2LONG(index);
04002     int byte = NUM2INT(value);
04003 
04004     rb_str_modify(str);
04005 
04006     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04007         rb_raise(rb_eIndexError, "index %ld out of string", pos);
04008     if (pos < 0)
04009         pos += RSTRING_LEN(str);
04010 
04011     RSTRING_PTR(str)[pos] = byte;
04012 
04013     return value;
04014 }
04015 
04016 static VALUE
04017 str_byte_substr(VALUE str, long beg, long len)
04018 {
04019     char *p, *s = RSTRING_PTR(str);
04020     long n = RSTRING_LEN(str);
04021     VALUE str2;
04022 
04023     if (beg > n || len < 0) return Qnil;
04024     if (beg < 0) {
04025         beg += n;
04026         if (beg < 0) return Qnil;
04027     }
04028     if (beg + len > n)
04029         len = n - beg;
04030     if (len <= 0) {
04031         len = 0;
04032         p = 0;
04033     }
04034     else
04035         p = s + beg;
04036 
04037     if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04038         str2 = rb_str_new4(str);
04039         str2 = str_new3(rb_obj_class(str2), str2);
04040         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04041         RSTRING(str2)->as.heap.len = len;
04042     }
04043     else {
04044         str2 = rb_str_new5(str, p, len);
04045     }
04046 
04047     str_enc_copy(str2, str);
04048 
04049     if (RSTRING_LEN(str2) == 0) {
04050         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
04051             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
04052         else
04053             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04054     }
04055     else {
04056         switch (ENC_CODERANGE(str)) {
04057           case ENC_CODERANGE_7BIT:
04058             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04059             break;
04060           default:
04061             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
04062             break;
04063         }
04064     }
04065 
04066     OBJ_INFECT(str2, str);
04067 
04068     return str2;
04069 }
04070 
04071 static VALUE
04072 str_byte_aref(VALUE str, VALUE indx)
04073 {
04074     long idx;
04075     switch (TYPE(indx)) {
04076       case T_FIXNUM:
04077         idx = FIX2LONG(indx);
04078 
04079       num_index:
04080         str = str_byte_substr(str, idx, 1);
04081         if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04082         return str;
04083 
04084       default:
04085         /* check if indx is Range */
04086         {
04087             long beg, len = RSTRING_LEN(str);
04088 
04089             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04090               case Qfalse:
04091                 break;
04092               case Qnil:
04093                 return Qnil;
04094               default:
04095                 return str_byte_substr(str, beg, len);
04096             }
04097         }
04098         idx = NUM2LONG(indx);
04099         goto num_index;
04100     }
04101     return Qnil;                /* not reached */
04102 }
04103 
04104 /*
04105  *  call-seq:
04106  *     str.byteslice(fixnum)           -> new_str or nil
04107  *     str.byteslice(fixnum, fixnum)   -> new_str or nil
04108  *     str.byteslice(range)            -> new_str or nil
04109  *
04110  *  Byte Reference---If passed a single <code>Fixnum</code>, returns a
04111  *  substring of one byte at that position. If passed two <code>Fixnum</code>
04112  *  objects, returns a substring starting at the offset given by the first, and
04113  *  a length given by the second. If given a <code>Range</code>, a substring containing
04114  *  bytes at offsets given by the range is returned. In all three cases, if
04115  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
04116  *  <code>nil</code> if the initial offset falls outside the string, the length
04117  *  is negative, or the beginning of the range is greater than the end.
04118  *  The encoding of the resulted string keeps original encoding.
04119  *
04120  *     "hello".byteslice(1)     #=> "e"
04121  *     "hello".byteslice(-1)    #=> "o"
04122  *     "hello".byteslice(1, 2)  #=> "el"
04123  *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
04124  *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
04125  */
04126 
04127 static VALUE
04128 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04129 {
04130     if (argc == 2) {
04131         return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04132     }
04133     if (argc != 1) {
04134         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
04135     }
04136     return str_byte_aref(str, argv[0]);
04137 }
04138 
04139 /*
04140  *  call-seq:
04141  *     str.reverse   -> new_str
04142  *
04143  *  Returns a new string with the characters from <i>str</i> in reverse order.
04144  *
04145  *     "stressed".reverse   #=> "desserts"
04146  */
04147 
04148 static VALUE
04149 rb_str_reverse(VALUE str)
04150 {
04151     rb_encoding *enc;
04152     VALUE rev;
04153     char *s, *e, *p;
04154     int single = 1;
04155 
04156     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04157     enc = STR_ENC_GET(str);
04158     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04159     s = RSTRING_PTR(str); e = RSTRING_END(str);
04160     p = RSTRING_END(rev);
04161 
04162     if (RSTRING_LEN(str) > 1) {
04163         if (single_byte_optimizable(str)) {
04164             while (s < e) {
04165                 *--p = *s++;
04166             }
04167         }
04168         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04169             while (s < e) {
04170                 int clen = rb_enc_fast_mbclen(s, e, enc);
04171 
04172                 if (clen > 1 || (*s & 0x80)) single = 0;
04173                 p -= clen;
04174                 memcpy(p, s, clen);
04175                 s += clen;
04176             }
04177         }
04178         else {
04179             while (s < e) {
04180                 int clen = rb_enc_mbclen(s, e, enc);
04181 
04182                 if (clen > 1 || (*s & 0x80)) single = 0;
04183                 p -= clen;
04184                 memcpy(p, s, clen);
04185                 s += clen;
04186             }
04187         }
04188     }
04189     STR_SET_LEN(rev, RSTRING_LEN(str));
04190     OBJ_INFECT(rev, str);
04191     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04192         if (single) {
04193             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04194         }
04195         else {
04196             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04197         }
04198     }
04199     rb_enc_cr_str_copy_for_substr(rev, str);
04200 
04201     return rev;
04202 }
04203 
04204 
04205 /*
04206  *  call-seq:
04207  *     str.reverse!   -> str
04208  *
04209  *  Reverses <i>str</i> in place.
04210  */
04211 
04212 static VALUE
04213 rb_str_reverse_bang(VALUE str)
04214 {
04215     if (RSTRING_LEN(str) > 1) {
04216         if (single_byte_optimizable(str)) {
04217             char *s, *e, c;
04218 
04219             str_modify_keep_cr(str);
04220             s = RSTRING_PTR(str);
04221             e = RSTRING_END(str) - 1;
04222             while (s < e) {
04223                 c = *s;
04224                 *s++ = *e;
04225                 *e-- = c;
04226             }
04227         }
04228         else {
04229             rb_str_shared_replace(str, rb_str_reverse(str));
04230         }
04231     }
04232     else {
04233         str_modify_keep_cr(str);
04234     }
04235     return str;
04236 }
04237 
04238 
04239 /*
04240  *  call-seq:
04241  *     str.include? other_str   -> true or false
04242  *
04243  *  Returns <code>true</code> if <i>str</i> contains the given string or
04244  *  character.
04245  *
04246  *     "hello".include? "lo"   #=> true
04247  *     "hello".include? "ol"   #=> false
04248  *     "hello".include? ?h     #=> true
04249  */
04250 
04251 static VALUE
04252 rb_str_include(VALUE str, VALUE arg)
04253 {
04254     long i;
04255 
04256     StringValue(arg);
04257     i = rb_str_index(str, arg, 0);
04258 
04259     if (i == -1) return Qfalse;
04260     return Qtrue;
04261 }
04262 
04263 
04264 /*
04265  *  call-seq:
04266  *     str.to_i(base=10)   -> integer
04267  *
04268  *  Returns the result of interpreting leading characters in <i>str</i> as an
04269  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
04270  *  end of a valid number are ignored. If there is not a valid number at the
04271  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
04272  *  exception when <i>base</i> is valid.
04273  *
04274  *     "12345".to_i             #=> 12345
04275  *     "99 red balloons".to_i   #=> 99
04276  *     "0a".to_i                #=> 0
04277  *     "0a".to_i(16)            #=> 10
04278  *     "hello".to_i             #=> 0
04279  *     "1100101".to_i(2)        #=> 101
04280  *     "1100101".to_i(8)        #=> 294977
04281  *     "1100101".to_i(10)       #=> 1100101
04282  *     "1100101".to_i(16)       #=> 17826049
04283  */
04284 
04285 static VALUE
04286 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04287 {
04288     int base;
04289 
04290     if (argc == 0) base = 10;
04291     else {
04292         VALUE b;
04293 
04294         rb_scan_args(argc, argv, "01", &b);
04295         base = NUM2INT(b);
04296     }
04297     if (base < 0) {
04298         rb_raise(rb_eArgError, "invalid radix %d", base);
04299     }
04300     return rb_str_to_inum(str, base, FALSE);
04301 }
04302 
04303 
04304 /*
04305  *  call-seq:
04306  *     str.to_f   -> float
04307  *
04308  *  Returns the result of interpreting leading characters in <i>str</i> as a
04309  *  floating point number. Extraneous characters past the end of a valid number
04310  *  are ignored. If there is not a valid number at the start of <i>str</i>,
04311  *  <code>0.0</code> is returned. This method never raises an exception.
04312  *
04313  *     "123.45e1".to_f        #=> 1234.5
04314  *     "45.67 degrees".to_f   #=> 45.67
04315  *     "thx1138".to_f         #=> 0.0
04316  */
04317 
04318 static VALUE
04319 rb_str_to_f(VALUE str)
04320 {
04321     return DBL2NUM(rb_str_to_dbl(str, FALSE));
04322 }
04323 
04324 
04325 /*
04326  *  call-seq:
04327  *     str.to_s     -> str
04328  *     str.to_str   -> str
04329  *
04330  *  Returns the receiver.
04331  */
04332 
04333 static VALUE
04334 rb_str_to_s(VALUE str)
04335 {
04336     if (rb_obj_class(str) != rb_cString) {
04337         return str_duplicate(rb_cString, str);
04338     }
04339     return str;
04340 }
04341 
04342 #if 0
04343 static void
04344 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
04345 {
04346     char s[RUBY_MAX_CHAR_LEN];
04347     int n = rb_enc_codelen(c, enc);
04348 
04349     rb_enc_mbcput(c, s, enc);
04350     rb_enc_str_buf_cat(str, s, n, enc);
04351 }
04352 #endif
04353 
04354 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
04355 
04356 int
04357 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04358 {
04359     char buf[CHAR_ESC_LEN + 1];
04360     int l;
04361 
04362 #if SIZEOF_INT > 4
04363     c &= 0xffffffff;
04364 #endif
04365     if (unicode_p) {
04366         if (c < 0x7F && ISPRINT(c)) {
04367             snprintf(buf, CHAR_ESC_LEN, "%c", c);
04368         }
04369         else if (c < 0x10000) {
04370             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04371         }
04372         else {
04373             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04374         }
04375     }
04376     else {
04377         if (c < 0x100) {
04378             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04379         }
04380         else {
04381             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04382         }
04383     }
04384     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
04385     rb_str_buf_cat(result, buf, l);
04386     return l;
04387 }
04388 
04389 /*
04390  * call-seq:
04391  *   str.inspect   -> string
04392  *
04393  * Returns a printable version of _str_, surrounded by quote marks,
04394  * with special characters escaped.
04395  *
04396  *    str = "hello"
04397  *    str[3] = "\b"
04398  *    str.inspect       #=> "\"hel\\bo\""
04399  */
04400 
04401 VALUE
04402 rb_str_inspect(VALUE str)
04403 {
04404     rb_encoding *enc = STR_ENC_GET(str);
04405     const char *p, *pend, *prev;
04406     char buf[CHAR_ESC_LEN + 1];
04407     VALUE result = rb_str_buf_new(0);
04408     rb_encoding *resenc = rb_default_internal_encoding();
04409     int unicode_p = rb_enc_unicode_p(enc);
04410     int asciicompat = rb_enc_asciicompat(enc);
04411     static rb_encoding *utf16, *utf32;
04412 
04413     if (!utf16) utf16 = rb_enc_find("UTF-16");
04414     if (!utf32) utf32 = rb_enc_find("UTF-32");
04415     if (resenc == NULL) resenc = rb_default_external_encoding();
04416     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04417     rb_enc_associate(result, resenc);
04418     str_buf_cat2(result, "\"");
04419 
04420     p = RSTRING_PTR(str); pend = RSTRING_END(str);
04421     prev = p;
04422     if (enc == utf16) {
04423         const unsigned char *q = (const unsigned char *)p;
04424         if (q[0] == 0xFE && q[1] == 0xFF)
04425             enc = rb_enc_find("UTF-16BE");
04426         else if (q[0] == 0xFF && q[1] == 0xFE)
04427             enc = rb_enc_find("UTF-16LE");
04428         else
04429             unicode_p = 0;
04430     }
04431     else if (enc == utf32) {
04432         const unsigned char *q = (const unsigned char *)p;
04433         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04434             enc = rb_enc_find("UTF-32BE");
04435         else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04436             enc = rb_enc_find("UTF-32LE");
04437         else
04438             unicode_p = 0;
04439     }
04440     while (p < pend) {
04441         unsigned int c, cc;
04442         int n;
04443 
04444         n = rb_enc_precise_mbclen(p, pend, enc);
04445         if (!MBCLEN_CHARFOUND_P(n)) {
04446             if (p > prev) str_buf_cat(result, prev, p - prev);
04447             n = rb_enc_mbminlen(enc);
04448             if (pend < p + n)
04449                 n = (int)(pend - p);
04450             while (n--) {
04451                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04452                 str_buf_cat(result, buf, strlen(buf));
04453                 prev = ++p;
04454             }
04455             continue;
04456         }
04457         n = MBCLEN_CHARFOUND_LEN(n);
04458         c = rb_enc_mbc_to_codepoint(p, pend, enc);
04459         p += n;
04460         if ((asciicompat || unicode_p) &&
04461           (c == '"'|| c == '\\' ||
04462             (c == '#' &&
04463              p < pend &&
04464              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04465              (cc = rb_enc_codepoint(p,pend,enc),
04466               (cc == '$' || cc == '@' || cc == '{'))))) {
04467             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04468             str_buf_cat2(result, "\\");
04469             if (asciicompat || enc == resenc) {
04470                 prev = p - n;
04471                 continue;
04472             }
04473         }
04474         switch (c) {
04475           case '\n': cc = 'n'; break;
04476           case '\r': cc = 'r'; break;
04477           case '\t': cc = 't'; break;
04478           case '\f': cc = 'f'; break;
04479           case '\013': cc = 'v'; break;
04480           case '\010': cc = 'b'; break;
04481           case '\007': cc = 'a'; break;
04482           case 033: cc = 'e'; break;
04483           default: cc = 0; break;
04484         }
04485         if (cc) {
04486             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04487             buf[0] = '\\';
04488             buf[1] = (char)cc;
04489             str_buf_cat(result, buf, 2);
04490             prev = p;
04491             continue;
04492         }
04493         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04494             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04495             continue;
04496         }
04497         else {
04498             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04499             rb_str_buf_cat_escaped_char(result, c, unicode_p);
04500             prev = p;
04501             continue;
04502         }
04503     }
04504     if (p > prev) str_buf_cat(result, prev, p - prev);
04505     str_buf_cat2(result, "\"");
04506 
04507     OBJ_INFECT(result, str);
04508     return result;
04509 }
04510 
04511 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04512 
04513 /*
04514  *  call-seq:
04515  *     str.dump   -> new_str
04516  *
04517  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
04518  *  <code>\xNN</code> notation (<code>\u{NNNN}</code> for multibyte characters
04518  *  of UTF-8 strings) and all special characters escaped.
04519  */
04520 
04521 VALUE
04522 rb_str_dump(VALUE str)
04523 {
04524     rb_encoding *enc = rb_enc_get(str);
04525     long len;
04526     const char *p, *pend;
04527     char *q, *qend;
04528     VALUE result;
04529     int u8 = (enc == rb_utf8_encoding());
04530 
04531     len = 2;                    /* "" */
04532     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04533     while (p < pend) {
04534         unsigned char c = *p++;
04535         switch (c) {
04536           case '"':  case '\\':
04537           case '\n': case '\r':
04538           case '\t': case '\f':
04539           case '\013': case '\010': case '\007': case '\033':
04540             len += 2;
04541             break;
04542 
04543           case '#':
04544             len += IS_EVSTR(p, pend) ? 2 : 1;
04545             break;
04546 
04547           default:
04548             if (ISPRINT(c)) {
04549                 len++;
04550             }
04551             else {
04552                 if (u8) {       /* \u{NN} */
04553                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
04554                     if (MBCLEN_CHARFOUND_P(n-1)) {
04555                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04556                         while (cc >>= 4) len++;
04557                         len += 5;
04558                         p += MBCLEN_CHARFOUND_LEN(n)-1;
04559                         break;
04560                     }
04561                 }
04562                 len += 4;       /* \xNN */
04563             }
04564             break;
04565         }
04566     }
04567     if (!rb_enc_asciicompat(enc)) {
04568         len += 19;              /* ".force_encoding('')" */
04569         len += strlen(enc->name);
04570     }
04571 
04572     result = rb_str_new5(str, 0, len);
04573     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04574     q = RSTRING_PTR(result); qend = q + len + 1;
04575 
04576     *q++ = '"';
04577     while (p < pend) {
04578         unsigned char c = *p++;
04579 
04580         if (c == '"' || c == '\\') {
04581             *q++ = '\\';
04582             *q++ = c;
04583         }
04584         else if (c == '#') {
04585             if (IS_EVSTR(p, pend)) *q++ = '\\';
04586             *q++ = '#';
04587         }
04588         else if (c == '\n') {
04589             *q++ = '\\';
04590             *q++ = 'n';
04591         }
04592         else if (c == '\r') {
04593             *q++ = '\\';
04594             *q++ = 'r';
04595         }
04596         else if (c == '\t') {
04597             *q++ = '\\';
04598             *q++ = 't';
04599         }
04600         else if (c == '\f') {
04601             *q++ = '\\';
04602             *q++ = 'f';
04603         }
04604         else if (c == '\013') {
04605             *q++ = '\\';
04606             *q++ = 'v';
04607         }
04608         else if (c == '\010') {
04609             *q++ = '\\';
04610             *q++ = 'b';
04611         }
04612         else if (c == '\007') {
04613             *q++ = '\\';
04614             *q++ = 'a';
04615         }
04616         else if (c == '\033') {
04617             *q++ = '\\';
04618             *q++ = 'e';
04619         }
04620         else if (ISPRINT(c)) {
04621             *q++ = c;
04622         }
04623         else {
04624             *q++ = '\\';
04625             if (u8) {
04626                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04627                 if (MBCLEN_CHARFOUND_P(n)) {
04628                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04629                     p += n;
04630                     snprintf(q, qend-q, "u{%x}", cc);
04631                     q += strlen(q);
04632                     continue;
04633                 }
04634             }
04635             snprintf(q, qend-q, "x%02X", c);
04636             q += 3;
04637         }
04638     }
04639     *q++ = '"';
04640     *q = '\0';
04641     if (!rb_enc_asciicompat(enc)) {
04642         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04643         enc = rb_ascii8bit_encoding();
04644     }
04645     OBJ_INFECT(result, str);
04646     /* result from dump is ASCII */
04647     rb_enc_associate(result, enc);
04648     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04649     return result;
04650 }
04651 
04652 
04653 static void
04654 rb_str_check_dummy_enc(rb_encoding *enc)
04655 {
04656     if (rb_enc_dummy_p(enc)) {
04657         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04658                  rb_enc_name(enc));
04659     }
04660 }
04661 
04662 /*
04663  *  call-seq:
04664  *     str.upcase!   -> str or nil
04665  *
04666  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
04667  *  were made.
04668  *  Note: case replacement is effective only in ASCII region.
04669  */
04670 
04671 static VALUE
04672 rb_str_upcase_bang(VALUE str)
04673 {
04674     rb_encoding *enc;
04675     char *s, *send;
04676     int modify = 0;
04677     int n;
04678 
04679     str_modify_keep_cr(str);
04680     enc = STR_ENC_GET(str);
04681     rb_str_check_dummy_enc(enc);
04682     s = RSTRING_PTR(str); send = RSTRING_END(str);
04683     if (single_byte_optimizable(str)) {
04684         while (s < send) {
04685             unsigned int c = *(unsigned char*)s;
04686 
04687             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04688                 *s = 'A' + (c - 'a');
04689                 modify = 1;
04690             }
04691             s++;
04692         }
04693     }
04694     else {
04695         int ascompat = rb_enc_asciicompat(enc);
04696 
04697         while (s < send) {
04698             unsigned int c;
04699 
04700             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04701                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04702                     *s = 'A' + (c - 'a');
04703                     modify = 1;
04704                 }
04705                 s++;
04706             }
04707             else {
04708                 c = rb_enc_codepoint_len(s, send, &n, enc);
04709                 if (rb_enc_islower(c, enc)) {
04710                     /* assuming toupper returns codepoint with same size */
04711                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04712                     modify = 1;
04713                 }
04714                 s += n;
04715             }
04716         }
04717     }
04718 
04719     if (modify) return str;
04720     return Qnil;
04721 }
04722 
04723 
04724 /*
04725  *  call-seq:
04726  *     str.upcase   -> new_str
04727  *
04728  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
04729  *  uppercase counterparts. The operation is locale insensitive---only
04730  *  characters ``a'' to ``z'' are affected.
04731  *  Note: case replacement is effective only in ASCII region.
04732  *
04733  *     "hEllO".upcase   #=> "HELLO"
04734  */
04735 
04736 static VALUE
04737 rb_str_upcase(VALUE str)
04738 {
04739     str = rb_str_dup(str);
04740     rb_str_upcase_bang(str);
04741     return str;
04742 }
04743 
04744 
04745 /*
04746  *  call-seq:
04747  *     str.downcase!   -> str or nil
04748  *
04749  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
04750  *  changes were made.
04751  *  Note: case replacement is effective only in ASCII region.
04752  */
04753 
04754 static VALUE
04755 rb_str_downcase_bang(VALUE str)
04756 {
04757     rb_encoding *enc;
04758     char *s, *send;
04759     int modify = 0;
04760 
04761     str_modify_keep_cr(str);
04762     enc = STR_ENC_GET(str);
04763     rb_str_check_dummy_enc(enc);
04764     s = RSTRING_PTR(str); send = RSTRING_END(str);
04765     if (single_byte_optimizable(str)) {
04766         while (s < send) {
04767             unsigned int c = *(unsigned char*)s;
04768 
04769             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04770                 *s = 'a' + (c - 'A');
04771                 modify = 1;
04772             }
04773             s++;
04774         }
04775     }
04776     else {
04777         int ascompat = rb_enc_asciicompat(enc);
04778 
04779         while (s < send) {
04780             unsigned int c;
04781             int n;
04782 
04783             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04784                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04785                     *s = 'a' + (c - 'A');
04786                     modify = 1;
04787                 }
04788                 s++;
04789             }
04790             else {
04791                 c = rb_enc_codepoint_len(s, send, &n, enc);
04792                 if (rb_enc_isupper(c, enc)) {
04793                     /* assuming toupper returns codepoint with same size */
04794                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04795                     modify = 1;
04796                 }
04797                 s += n;
04798             }
04799         }
04800     }
04801 
04802     if (modify) return str;
04803     return Qnil;
04804 }
04805 
04806 
04807 /*
04808  *  call-seq:
04809  *     str.downcase   -> new_str
04810  *
04811  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
04812  *  lowercase counterparts. The operation is locale insensitive---only
04813  *  characters ``A'' to ``Z'' are affected.
04814  *  Note: case replacement is effective only in ASCII region.
04815  *
04816  *     "hEllO".downcase   #=> "hello"
04817  */
04818 
04819 static VALUE
04820 rb_str_downcase(VALUE str)
04821 {
04822     str = rb_str_dup(str);
04823     rb_str_downcase_bang(str);
04824     return str;
04825 }
04826 
04827 
04828 /*
04829  *  call-seq:
04830  *     str.capitalize!   -> str or nil
04831  *
04832  *  Modifies <i>str</i> by converting the first character to uppercase and the
04833  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
04834  *  Note: case conversion is effective only in ASCII region.
04835  *
04836  *     a = "hello"
04837  *     a.capitalize!   #=> "Hello"
04838  *     a               #=> "Hello"
04839  *     a.capitalize!   #=> nil
04840  */
04841 
04842 static VALUE
04843 rb_str_capitalize_bang(VALUE str)
04844 {
04845     rb_encoding *enc;
04846     char *s, *send;
04847     int modify = 0;
04848     unsigned int c;
04849     int n;
04850 
04851     str_modify_keep_cr(str);
04852     enc = STR_ENC_GET(str);
04853     rb_str_check_dummy_enc(enc);
04854     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04855     s = RSTRING_PTR(str); send = RSTRING_END(str);
04856 
04857     c = rb_enc_codepoint_len(s, send, &n, enc);
04858     if (rb_enc_islower(c, enc)) {
04859         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04860         modify = 1;
04861     }
04862     s += n;
04863     while (s < send) {
04864         c = rb_enc_codepoint_len(s, send, &n, enc);
04865         if (rb_enc_isupper(c, enc)) {
04866             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04867             modify = 1;
04868         }
04869         s += n;
04870     }
04871 
04872     if (modify) return str;
04873     return Qnil;
04874 }
04875 
04876 
04877 /*
04878  *  call-seq:
04879  *     str.capitalize   -> new_str
04880  *
04881  *  Returns a copy of <i>str</i> with the first character converted to uppercase
04882  *  and the remainder to lowercase.
04883  *  Note: case conversion is effective only in ASCII region.
04884  *
04885  *     "hello".capitalize    #=> "Hello"
04886  *     "HELLO".capitalize    #=> "Hello"
04887  *     "123ABC".capitalize   #=> "123abc"
04888  */
04889 
04890 static VALUE
04891 rb_str_capitalize(VALUE str)
04892 {
04893     str = rb_str_dup(str);
04894     rb_str_capitalize_bang(str);
04895     return str;
04896 }
04897 
04898 
04899 /*
04900  *  call-seq:
04901  *     str.swapcase!   -> str or nil
04902  *
04903  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
04904  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
04905  *  Note: case conversion is effective only in ASCII region.
04906  */
04907 
04908 static VALUE
04909 rb_str_swapcase_bang(VALUE str)
04910 {
04911     rb_encoding *enc;
04912     char *s, *send;
04913     int modify = 0;
04914     int n;
04915 
04916     str_modify_keep_cr(str);
04917     enc = STR_ENC_GET(str);
04918     rb_str_check_dummy_enc(enc);
04919     s = RSTRING_PTR(str); send = RSTRING_END(str);
04920     while (s < send) {
04921         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04922 
04923         if (rb_enc_isupper(c, enc)) {
04924             /* assuming toupper returns codepoint with same size */
04925             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04926             modify = 1;
04927         }
04928         else if (rb_enc_islower(c, enc)) {
04929             /* assuming tolower returns codepoint with same size */
04930             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04931             modify = 1;
04932         }
04933         s += n;
04934     }
04935 
04936     if (modify) return str;
04937     return Qnil;
04938 }
04939 
04940 
04941 /*
04942  *  call-seq:
04943  *     str.swapcase   -> new_str
04944  *
04945  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
04946  *  to lowercase and lowercase characters converted to uppercase.
04947  *  Note: case conversion is effective only in ASCII region.
04948  *
04949  *     "Hello".swapcase          #=> "hELLO"
04950  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
04951  */
04952 
04953 static VALUE
04954 rb_str_swapcase(VALUE str)
04955 {
04956     str = rb_str_dup(str);
04957     rb_str_swapcase_bang(str);
04958     return str;
04959 }
04960 
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (plain characters,
 * backslash escapes and c1-c2 ranges); advanced by trnext().
 */
struct tr {
    int gen;            /* non-zero while a c1-c2 range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
04968 
04969 static unsigned int
04970 trnext(struct tr *t, rb_encoding *enc)
04971 {
04972     int n;
04973 
04974     for (;;) {
04975         if (!t->gen) {
04976             if (t->p == t->pend) return -1;
04977             if (t->p < t->pend - 1 && *t->p == '\\') {
04978                 t->p++;
04979             }
04980             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04981             t->p += n;
04982             if (t->p < t->pend - 1 && *t->p == '-') {
04983                 t->p++;
04984                 if (t->p < t->pend) {
04985                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04986                     t->p += n;
04987                     if (t->now > c) {
04988                         if (t->now < 0x80 && c < 0x80) {
04989                             rb_raise(rb_eArgError,
04990                                      "invalid range \"%c-%c\" in string transliteration",
04991                                      t->now, c);
04992                         }
04993                         else {
04994                             rb_raise(rb_eArgError, "invalid range in string transliteration");
04995                         }
04996                         continue; /* not reached */
04997                     }
04998                     t->gen = 1;
04999                     t->max = c;
05000                 }
05001             }
05002             return t->now;
05003         }
05004         else if (++t->now < t->max) {
05005             return t->now;
05006         }
05007         else {
05008             t->gen = 0;
05009             return t->max;
05010         }
05011     }
05012 }
05013 
05014 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
05015 
05016 static VALUE
05017 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
05018 {
05019     const unsigned int errc = -1;
05020     unsigned int trans[256];
05021     rb_encoding *enc, *e1, *e2;
05022     struct tr trsrc, trrepl;
05023     int cflag = 0;
05024     unsigned int c, c0, last = 0;
05025     int modify = 0, i, l;
05026     char *s, *send;
05027     VALUE hash = 0;
05028     int singlebyte = single_byte_optimizable(str);
05029     int cr;
05030 
05031 #define CHECK_IF_ASCII(c) \
05032     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05033            (cr = ENC_CODERANGE_VALID) : 0)
05034 
05035     StringValue(src);
05036     StringValue(repl);
05037     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05038     if (RSTRING_LEN(repl) == 0) {
05039         return rb_str_delete_bang(1, &src, str);
05040     }
05041 
05042     cr = ENC_CODERANGE(str);
05043     e1 = rb_enc_check(str, src);
05044     e2 = rb_enc_check(str, repl);
05045     if (e1 == e2) {
05046         enc = e1;
05047     }
05048     else {
05049         enc = rb_enc_check(src, repl);
05050     }
05051     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05052     if (RSTRING_LEN(src) > 1 &&
05053         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05054         trsrc.p + l < trsrc.pend) {
05055         cflag = 1;
05056         trsrc.p += l;
05057     }
05058     trrepl.p = RSTRING_PTR(repl);
05059     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05060     trsrc.gen = trrepl.gen = 0;
05061     trsrc.now = trrepl.now = 0;
05062     trsrc.max = trrepl.max = 0;
05063 
05064     if (cflag) {
05065         for (i=0; i<256; i++) {
05066             trans[i] = 1;
05067         }
05068         while ((c = trnext(&trsrc, enc)) != errc) {
05069             if (c < 256) {
05070                 trans[c] = errc;
05071             }
05072             else {
05073                 if (!hash) hash = rb_hash_new();
05074                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05075             }
05076         }
05077         while ((c = trnext(&trrepl, enc)) != errc)
05078             /* retrieve last replacer */;
05079         last = trrepl.now;
05080         for (i=0; i<256; i++) {
05081             if (trans[i] != errc) {
05082                 trans[i] = last;
05083             }
05084         }
05085     }
05086     else {
05087         unsigned int r;
05088 
05089         for (i=0; i<256; i++) {
05090             trans[i] = errc;
05091         }
05092         while ((c = trnext(&trsrc, enc)) != errc) {
05093             r = trnext(&trrepl, enc);
05094             if (r == errc) r = trrepl.now;
05095             if (c < 256) {
05096                 trans[c] = r;
05097                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05098             }
05099             else {
05100                 if (!hash) hash = rb_hash_new();
05101                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05102             }
05103         }
05104     }
05105 
05106     if (cr == ENC_CODERANGE_VALID)
05107         cr = ENC_CODERANGE_7BIT;
05108     str_modify_keep_cr(str);
05109     s = RSTRING_PTR(str); send = RSTRING_END(str);
05110     if (sflag) {
05111         int clen, tlen;
05112         long offset, max = RSTRING_LEN(str);
05113         unsigned int save = -1;
05114         char *buf = ALLOC_N(char, max), *t = buf;
05115 
05116         while (s < send) {
05117             int may_modify = 0;
05118 
05119             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05120             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05121 
05122             s += clen;
05123             if (c < 256) {
05124                 c = trans[c];
05125             }
05126             else if (hash) {
05127                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05128                 if (NIL_P(tmp)) {
05129                     if (cflag) c = last;
05130                     else c = errc;
05131                 }
05132                 else if (cflag) c = errc;
05133                 else c = NUM2INT(tmp);
05134             }
05135             else {
05136                 c = errc;
05137             }
05138             if (c != (unsigned int)-1) {
05139                 if (save == c) {
05140                     CHECK_IF_ASCII(c);
05141                     continue;
05142                 }
05143                 save = c;
05144                 tlen = rb_enc_codelen(c, enc);
05145                 modify = 1;
05146             }
05147             else {
05148                 save = -1;
05149                 c = c0;
05150                 if (enc != e1) may_modify = 1;
05151             }
05152             while (t - buf + tlen >= max) {
05153                 offset = t - buf;
05154                 max *= 2;
05155                 REALLOC_N(buf, char, max);
05156                 t = buf + offset;
05157             }
05158             rb_enc_mbcput(c, t, enc);
05159             if (may_modify && memcmp(s, t, tlen) != 0) {
05160                 modify = 1;
05161             }
05162             CHECK_IF_ASCII(c);
05163             t += tlen;
05164         }
05165         if (!STR_EMBED_P(str)) {
05166             xfree(RSTRING(str)->as.heap.ptr);
05167         }
05168         *t = '\0';
05169         RSTRING(str)->as.heap.ptr = buf;
05170         RSTRING(str)->as.heap.len = t - buf;
05171         STR_SET_NOEMBED(str);
05172         RSTRING(str)->as.heap.aux.capa = max;
05173     }
05174     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05175         while (s < send) {
05176             c = (unsigned char)*s;
05177             if (trans[c] != errc) {
05178                 if (!cflag) {
05179                     c = trans[c];
05180                     *s = c;
05181                     modify = 1;
05182                 }
05183                 else {
05184                     *s = last;
05185                     modify = 1;
05186                 }
05187             }
05188             CHECK_IF_ASCII(c);
05189             s++;
05190         }
05191     }
05192     else {
05193         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05194         long offset;
05195         char *buf = ALLOC_N(char, max), *t = buf;
05196 
05197         while (s < send) {
05198             int may_modify = 0;
05199             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05200             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05201 
05202             if (c < 256) {
05203                 c = trans[c];
05204             }
05205             else if (hash) {
05206                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05207                 if (NIL_P(tmp)) {
05208                     if (cflag) c = last;
05209                     else c = errc;
05210                 }
05211                 else if (cflag) c = errc;
05212                 else c = NUM2INT(tmp);
05213             }
05214             else {
05215                 c = cflag ? last : errc;
05216             }
05217             if (c != errc) {
05218                 tlen = rb_enc_codelen(c, enc);
05219                 modify = 1;
05220             }
05221             else {
05222                 c = c0;
05223                 if (enc != e1) may_modify = 1;
05224             }
05225             while (t - buf + tlen >= max) {
05226                 offset = t - buf;
05227                 max *= 2;
05228                 REALLOC_N(buf, char, max);
05229                 t = buf + offset;
05230             }
05231             if (s != t) {
05232                 rb_enc_mbcput(c, t, enc);
05233                 if (may_modify && memcmp(s, t, tlen) != 0) {
05234                     modify = 1;
05235                 }
05236             }
05237             CHECK_IF_ASCII(c);
05238             s += clen;
05239             t += tlen;
05240         }
05241         if (!STR_EMBED_P(str)) {
05242             xfree(RSTRING(str)->as.heap.ptr);
05243         }
05244         *t = '\0';
05245         RSTRING(str)->as.heap.ptr = buf;
05246         RSTRING(str)->as.heap.len = t - buf;
05247         STR_SET_NOEMBED(str);
05248         RSTRING(str)->as.heap.aux.capa = max;
05249     }
05250 
05251     if (modify) {
05252         if (cr != ENC_CODERANGE_BROKEN)
05253             ENC_CODERANGE_SET(str, cr);
05254         rb_enc_associate(str, enc);
05255         return str;
05256     }
05257     return Qnil;
05258 }
05259 
05260 
05261 /*
05262  *  call-seq:
05263  *     str.tr!(from_str, to_str)   -> str or nil
05264  *
05265  *  Translates <i>str</i> in place, using the same rules as
05266  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
05267  *  changes were made.
05268  */
05269 
05270 static VALUE
05271 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05272 {
05273     return tr_trans(str, src, repl, 0);
05274 }
05275 
05276 
05277 /*
05278  *  call-seq:
05279  *     str.tr(from_str, to_str)   -> new_str
05280  *
05281  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i>
05282  *  replaced by the corresponding characters in <i>to_str</i>. If
05283  *  <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last
05284  *  character in order to maintain the correspondence.
05285  *
05286  *     "hello".tr('el', 'ip')      #=> "hippo"
05287  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
05288  *
05289  *  Both strings may use the c1-c2 notation to denote ranges of characters,
05290  *  and <i>from_str</i> may start with a <code>^</code>, which denotes all
05291  *  characters except those listed.
05292  *
05293  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
05294  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
05295  */
05296 
05297 static VALUE
05298 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05299 {
05300     str = rb_str_dup(str);
05301     tr_trans(str, src, repl, 0);
05302     return str;
05303 }
05304 
05305 #define TR_TABLE_SIZE 257
05306 static void
05307 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05308                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05309 {
05310     const unsigned int errc = -1;
05311     char buf[256];
05312     struct tr tr;
05313     unsigned int c;
05314     VALUE table = 0, ptable = 0;
05315     int i, l, cflag = 0;
05316 
05317     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05318     tr.gen = tr.now = tr.max = 0;
05319 
05320     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05321         cflag = 1;
05322         tr.p += l;
05323     }
05324     if (first) {
05325         for (i=0; i<256; i++) {
05326             stable[i] = 1;
05327         }
05328         stable[256] = cflag;
05329     }
05330     else if (stable[256] && !cflag) {
05331         stable[256] = 0;
05332     }
05333     for (i=0; i<256; i++) {
05334         buf[i] = cflag;
05335     }
05336 
05337     while ((c = trnext(&tr, enc)) != errc) {
05338         if (c < 256) {
05339             buf[c & 0xff] = !cflag;
05340         }
05341         else {
05342             VALUE key = UINT2NUM(c);
05343 
05344             if (!table) {
05345                 table = rb_hash_new();
05346                 if (cflag) {
05347                     ptable = *ctablep;
05348                     *ctablep = table;
05349                 }
05350                 else {
05351                     ptable = *tablep;
05352                     *tablep = table;
05353                 }
05354             }
05355             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05356                 rb_hash_aset(table, key, Qtrue);
05357             }
05358         }
05359     }
05360     for (i=0; i<256; i++) {
05361         stable[i] = stable[i] && buf[i];
05362     }
05363 }
05364 
05365 
05366 static int
05367 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05368 {
05369     if (c < 256) {
05370         return table[c] != 0;
05371     }
05372     else {
05373         VALUE v = UINT2NUM(c);
05374 
05375         if (del) {
05376             if (!NIL_P(rb_hash_lookup(del, v)) &&
05377                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05378                 return TRUE;
05379             }
05380         }
05381         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05382             return FALSE;
05383         }
05384         return table[256] ? TRUE : FALSE;
05385     }
05386 }
05387 
05388 /*
05389  *  call-seq:
05390  *     str.delete!([other_str]+)   -> str or nil
05391  *
05392  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
05393  *  <code>nil</code> if <i>str</i> was not modified.
05394  */
05395 
05396 static VALUE
05397 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05398 {
05399     char squeez[TR_TABLE_SIZE];
05400     rb_encoding *enc = 0;
05401     char *s, *send, *t;
05402     VALUE del = 0, nodel = 0;
05403     int modify = 0;
05404     int i, ascompat, cr;
05405 
05406     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05407     if (argc < 1) {
05408         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05409     }
05410     for (i=0; i<argc; i++) {
05411         VALUE s = argv[i];
05412 
05413         StringValue(s);
05414         enc = rb_enc_check(str, s);
05415         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05416     }
05417 
05418     str_modify_keep_cr(str);
05419     ascompat = rb_enc_asciicompat(enc);
05420     s = t = RSTRING_PTR(str);
05421     send = RSTRING_END(str);
05422     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05423     while (s < send) {
05424         unsigned int c;
05425         int clen;
05426 
05427         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05428             if (squeez[c]) {
05429                 modify = 1;
05430             }
05431             else {
05432                 if (t != s) *t = c;
05433                 t++;
05434             }
05435             s++;
05436         }
05437         else {
05438             c = rb_enc_codepoint_len(s, send, &clen, enc);
05439 
05440             if (tr_find(c, squeez, del, nodel)) {
05441                 modify = 1;
05442             }
05443             else {
05444                 if (t != s) rb_enc_mbcput(c, t, enc);
05445                 t += clen;
05446                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05447             }
05448             s += clen;
05449         }
05450     }
05451     *t = '\0';
05452     STR_SET_LEN(str, t - RSTRING_PTR(str));
05453     ENC_CODERANGE_SET(str, cr);
05454 
05455     if (modify) return str;
05456     return Qnil;
05457 }
05458 
05459 
05460 /*
05461  *  call-seq:
05462  *     str.delete([other_str]+)   -> new_str
05463  *
05464  *  Returns a copy of <i>str</i> with all characters in the intersection of its
05465  *  arguments deleted. Uses the same rules for building the set of characters as
05466  *  <code>String#count</code>.
05467  *
05468  *     "hello".delete "l","lo"        #=> "heo"
05469  *     "hello".delete "lo"            #=> "he"
05470  *     "hello".delete "aeiou", "^e"   #=> "hell"
05471  *     "hello".delete "ej-m"          #=> "ho"
05472  */
05473 
05474 static VALUE
05475 rb_str_delete(int argc, VALUE *argv, VALUE str)
05476 {
05477     str = rb_str_dup(str);
05478     rb_str_delete_bang(argc, argv, str);
05479     return str;
05480 }
05481 
05482 
05483 /*
05484  *  call-seq:
05485  *     str.squeeze!([other_str]*)   -> str or nil
05486  *
05487  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
05488  *  <code>nil</code> if no changes were made.
05489  */
05490 
05491 static VALUE
05492 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05493 {
05494     char squeez[TR_TABLE_SIZE];
05495     rb_encoding *enc = 0;
05496     VALUE del = 0, nodel = 0;
05497     char *s, *send, *t;
05498     int i, modify = 0;
05499     int ascompat, singlebyte = single_byte_optimizable(str);
05500     unsigned int save;
05501 
05502     if (argc == 0) {
05503         enc = STR_ENC_GET(str);
05504     }
05505     else {
05506         for (i=0; i<argc; i++) {
05507             VALUE s = argv[i];
05508 
05509             StringValue(s);
05510             enc = rb_enc_check(str, s);
05511             if (singlebyte && !single_byte_optimizable(s))
05512                 singlebyte = 0;
05513             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05514         }
05515     }
05516 
05517     str_modify_keep_cr(str);
05518     s = t = RSTRING_PTR(str);
05519     if (!s || RSTRING_LEN(str) == 0) return Qnil;
05520     send = RSTRING_END(str);
05521     save = -1;
05522     ascompat = rb_enc_asciicompat(enc);
05523 
05524     if (singlebyte) {
05525         while (s < send) {
05526             unsigned int c = *(unsigned char*)s++;
05527             if (c != save || (argc > 0 && !squeez[c])) {
05528                 *t++ = save = c;
05529             }
05530         }
05531     } else {
05532         while (s < send) {
05533             unsigned int c;
05534             int clen;
05535 
05536             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05537                 if (c != save || (argc > 0 && !squeez[c])) {
05538                     *t++ = save = c;
05539                 }
05540                 s++;
05541             }
05542             else {
05543                 c = rb_enc_codepoint_len(s, send, &clen, enc);
05544 
05545                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05546                     if (t != s) rb_enc_mbcput(c, t, enc);
05547                     save = c;
05548                     t += clen;
05549                 }
05550                 s += clen;
05551             }
05552         }
05553     }
05554 
05555     *t = '\0';
05556     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05557         STR_SET_LEN(str, t - RSTRING_PTR(str));
05558         modify = 1;
05559     }
05560 
05561     if (modify) return str;
05562     return Qnil;
05563 }
05564 
05565 
05566 /*
05567  *  call-seq:
05568  *     str.squeeze([other_str]*)    -> new_str
05569  *
05570  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
05571  *  procedure described for <code>String#count</code>. Returns a new string
05572  *  where runs of the same character that occur in this set are replaced by a
05573  *  single character. If no arguments are given, all runs of identical
05574  *  characters are replaced by a single character.
05575  *
05576  *     "yellow moon".squeeze                  #=> "yelow mon"
05577  *     "  now   is  the".squeeze(" ")         #=> " now is the"
05578  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
05579  */
05580 
05581 static VALUE
05582 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05583 {
05584     str = rb_str_dup(str);
05585     rb_str_squeeze_bang(argc, argv, str);
05586     return str;
05587 }
05588 
05589 
05590 /*
05591  *  call-seq:
05592  *     str.tr_s!(from_str, to_str)   -> str or nil
05593  *
05594  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
05595  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
05596  */
05597 
05598 static VALUE
05599 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05600 {
05601     return tr_trans(str, src, repl, 1);
05602 }
05603 
05604 
05605 /*
05606  *  call-seq:
05607  *     str.tr_s(from_str, to_str)   -> new_str
05608  *
05609  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
05610  *  then removes duplicate characters in regions that were affected by the
05611  *  translation.
05612  *
05613  *     "hello".tr_s('l', 'r')     #=> "hero"
05614  *     "hello".tr_s('el', '*')    #=> "h*o"
05615  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
05616  */
05617 
05618 static VALUE
05619 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05620 {
05621     str = rb_str_dup(str);
05622     tr_trans(str, src, repl, 1);
05623     return str;
05624 }
05625 
05626 
05627 /*
05628  *  call-seq:
05629  *     str.count([other_str]+)   -> fixnum
05630  *
05631  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
05632  *  intersection of these sets defines the characters to count in
05633  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
05634  *  negated. The sequence c1-c2 means all characters between c1 and c2.
05635  *
05636  *     a = "hello world"
05637  *     a.count "lo"            #=> 5
05638  *     a.count "lo", "o"       #=> 2
05639  *     a.count "hello", "^l"   #=> 4
05640  *     a.count "ej-m"          #=> 4
05641  */
05642 
05643 static VALUE
05644 rb_str_count(int argc, VALUE *argv, VALUE str)
05645 {
05646     char table[TR_TABLE_SIZE];
05647     rb_encoding *enc = 0;
05648     VALUE del = 0, nodel = 0;
05649     char *s, *send;
05650     int i;
05651     int ascompat;
05652 
05653     if (argc < 1) {
05654         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05655     }
05656     for (i=0; i<argc; i++) {
05657         VALUE tstr = argv[i];
05658         unsigned char c;
05659 
05660         StringValue(tstr);
05661         enc = rb_enc_check(str, tstr);
05662         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05663             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05664             int n = 0;
05665 
05666             s = RSTRING_PTR(str);
05667             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05668             send = RSTRING_END(str);
05669             while (s < send) {
05670                 if (*(unsigned char*)s++ == c) n++;
05671             }
05672             return INT2NUM(n);
05673         }
05674         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05675     }
05676 
05677     s = RSTRING_PTR(str);
05678     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05679     send = RSTRING_END(str);
05680     ascompat = rb_enc_asciicompat(enc);
05681     i = 0;
05682     while (s < send) {
05683         unsigned int c;
05684 
05685         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05686             if (table[c]) {
05687                 i++;
05688             }
05689             s++;
05690         }
05691         else {
05692             int clen;
05693             c = rb_enc_codepoint_len(s, send, &clen, enc);
05694             if (tr_find(c, table, del, nodel)) {
05695                 i++;
05696             }
05697             s += clen;
05698         }
05699     }
05700 
05701     return INT2NUM(i);
05702 }
05703 
/*
 * Lookup table indexed by byte value: 1 for the ASCII whitespace bytes
 * TAB, LF, VT, FF, CR (9..13) and SPACE (32), 0 for every other byte.
 */
static const char isspacetable[256] = {
    /* 0x00-0x0f: \t \n \v \f \r at 9..13 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20: ' ' -- the remaining entries are implicitly zero */
    1
};

/* Table-driven whitespace test for a single byte; the cast keeps a
 * negative char from indexing in front of the table. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05724 
05725 /*
05726  *  call-seq:
05727  *     str.split(pattern=$;, [limit])   -> anArray
05728  *
05729  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
05730  *  of these substrings.
05731  *
05732  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
05733  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
05734  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
05735  *  of contiguous whitespace characters ignored.
05736  *
05737  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
05738  *  pattern matches. Whenever the pattern matches a zero-length string,
05739  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
05740  *  groups, the respective matches will be returned in the array as well.
05741  *
05742  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
05743  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
05744  *  split on whitespace as if ` ' were specified.
05745  *
05746  *  If the <i>limit</i> parameter is omitted, trailing null fields are
05747  *  suppressed. If <i>limit</i> is a positive number, at most that number of
05748  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
05749  *  string is returned as the only entry in an array). If negative, there is no
05750  *  limit to the number of fields returned, and trailing null fields are not
05751  *  suppressed.
05752  *
05753  *     " now's  the time".split        #=> ["now's", "the", "time"]
05754  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
05755  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
05756  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
05757  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
05758  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
05759  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
05760  *
05761  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
05762  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
05763  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
05764  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
05765  */
05766 
05767 static VALUE
05768 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05769 {
          /*
           * Implementation of String#split.  Accepts up to two optional
           * arguments: the separator pattern and the limit.
           *
           * One of three strategies is selected from the pattern:
           *   awk    - fields are separated by runs of whitespace and leading
           *            whitespace is skipped
           *   string - fields are separated by a literal separator string
           *   regexp - fields are separated by regular-expression matches
           *
           * `beg` is always the byte offset in str where the not-yet-pushed
           * part begins; the code after the three branches pushes that tail.
           */
05770     rb_encoding *enc;
05771     VALUE spat;
05772     VALUE limit;
05773     enum {awk, string, regexp} split_type;
05774     long beg, end, i = 0;
05775     int lim = 0;
05776     VALUE result, tmp;
05777 
          /*
           * A limit was passed.  For lim <= 0 `limit` is reset to nil, so the
           * later !NIL_P(limit) tests mean "a positive limit is in force",
           * while `lim` itself still distinguishes 0 from a negative limit.
           * A limit of 1 returns the whole string as the only element (or an
           * empty array for an empty string).
           */
05778     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05779         lim = NUM2INT(limit);
05780         if (lim <= 0) limit = Qnil;
05781         else if (lim == 1) {
05782             if (RSTRING_LEN(str) == 0)
05783                 return rb_ary_new2(0);
05784             return rb_ary_new3(1, str);
05785         }
              /* i is compared against lim; it is bumped for each field pushed */
05786         i = 1;
05787     }
05788 
05789     enc = STR_ENC_GET(str);
          /* No pattern given: use rb_fs when it is set, otherwise awk mode. */
05790     if (NIL_P(spat)) {
05791         if (!NIL_P(rb_fs)) {
05792             spat = rb_fs;
05793             goto fs_set;
05794         }
05795         split_type = awk;
05796     }
05797     else {
05798       fs_set:
05799         if (TYPE(spat) == T_STRING) {
05800             rb_encoding *enc2 = STR_ENC_GET(spat);
05801 
05802             split_type = string;
05803             if (RSTRING_LEN(spat) == 0) {
05804                 /* Special case - split into chars */
05805                 spat = rb_reg_regcomp(spat);
05806                 split_type = regexp;
05807             }
                  /* A separator that is exactly one space selects awk mode. */
05808             else if (rb_enc_asciicompat(enc2) == 1) {
05809                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05810                     split_type = awk;
05811                 }
05812             }
05813             else {
                      /* same single-space test, decoded via rb_enc_ascget() */
05814                 int l;
05815                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05816                     RSTRING_LEN(spat) == l) {
05817                     split_type = awk;
05818                 }
05819             }
05820         }
05821         else {
                  /* any non-String pattern is passed through get_pat() */
05822             spat = get_pat(spat, 1);
05823             split_type = regexp;
05824         }
05825     }
05826 
05827     result = rb_ary_new();
05828     beg = 0;
05829     if (split_type == awk) {
              /*
               * beg/end are byte offsets of the current field relative to
               * bptr.  `skip` is nonzero while scanning whitespace before a
               * field.  When a field starts and the limit has been reached,
               * the loop stops so that everything from beg onward becomes the
               * final element.
               */
05830         char *ptr = RSTRING_PTR(str);
05831         char *eptr = RSTRING_END(str);
05832         char *bptr = ptr;
05833         int skip = 1;
05834         unsigned int c;
05835 
05836         end = beg;
05837         if (is_ascii_string(str)) {
                  /* byte-at-a-time scan */
05838             while (ptr < eptr) {
05839                 c = (unsigned char)*ptr++;
05840                 if (skip) {
05841                     if (ascii_isspace(c)) {
05842                         beg = ptr - bptr;
05843                     }
05844                     else {
05845                         end = ptr - bptr;
05846                         skip = 0;
05847                         if (!NIL_P(limit) && lim <= i) break;
05848                     }
05849                 }
05850                 else if (ascii_isspace(c)) {
05851                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05852                     skip = 1;
05853                     beg = ptr - bptr;
05854                     if (!NIL_P(limit)) ++i;
05855                 }
05856                 else {
05857                     end = ptr - bptr;
05858                 }
05859             }
05860         }
05861         else {
                  /* same state machine, advancing one decoded character at a time */
05862             while (ptr < eptr) {
05863                 int n;
05864 
05865                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05866                 ptr += n;
05867                 if (skip) {
05868                     if (rb_isspace(c)) {
05869                         beg = ptr - bptr;
05870                     }
05871                     else {
05872                         end = ptr - bptr;
05873                         skip = 0;
05874                         if (!NIL_P(limit) && lim <= i) break;
05875                     }
05876                 }
05877                 else if (rb_isspace(c)) {
05878                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05879                     skip = 1;
05880                     beg = ptr - bptr;
05881                     if (!NIL_P(limit)) ++i;
05882                 }
05883                 else {
05884                     end = ptr - bptr;
05885                 }
05886             }
05887         }
05888     }
05889     else if (split_type == string) {
              /*
               * Literal separator.  `ptr` is the start of the current field,
               * `temp` keeps the start of the string so offsets can be
               * computed, and `end` holds the rb_memsearch() result, used
               * here as an offset relative to ptr.
               */
05890         char *ptr = RSTRING_PTR(str);
05891         char *temp = ptr;
05892         char *eptr = RSTRING_END(str);
05893         char *sptr = RSTRING_PTR(spat);
05894         long slen = RSTRING_LEN(spat);
05895 
05896         if (is_broken_string(str)) {
05897             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05898         }
05899         if (is_broken_string(spat)) {
05900             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05901         }
05902         enc = rb_enc_check(str, spat);
05903         while (ptr < eptr &&
05904                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05905             /* Check we are at the start of a char */
05906             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05907             if (t != ptr + end) {
05908                 ptr = t;
05909                 continue;
05910             }
05911             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05912             ptr += end + slen;
05913             if (!NIL_P(limit) && lim <= ++i) break;
05914         }
05915         beg = ptr - temp;
05916     }
05917     else {
              /*
               * Regexp separator.  `start` is where the next search begins and
               * `beg` is where the pending field begins.  `last_null` records
               * that the previous iteration saw an empty match at `start` and
               * advanced past one character.
               */
05918         char *ptr = RSTRING_PTR(str);
05919         long len = RSTRING_LEN(str);
05920         long start = beg;
05921         long idx;
05922         int last_null = 0;
05923         struct re_registers *regs;
05924 
05925         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05926             regs = RMATCH_REGS(rb_backref_get());
                  /* empty match located exactly at the search position */
05927             if (start == end && BEG(0) == END(0)) {
05928                 if (!ptr) {
05929                     rb_ary_push(result, str_new_empty(str));
05930                     break;
05931                 }
05932                 else if (last_null == 1) {
                          /* second empty match in a row: emit one character as a field */
05933                     rb_ary_push(result, rb_str_subseq(str, beg,
05934                                                       rb_enc_fast_mbclen(ptr+beg,
05935                                                                          ptr+len,
05936                                                                          enc)));
05937                     beg = start;
05938                 }
05939                 else {
                          /*
                           * first empty match: move the search position forward
                           * by one character (one byte at end of string) and retry
                           */
05940                     if (ptr+start == ptr+len)
05941                         start++;
05942                     else
05943                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05944                     last_null = 1;
05945                     continue;
05946                 }
05947             }
05948             else {
05949                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05950                 beg = start = END(0);
05951             }
05952             last_null = 0;
05953 
                  /* append every capture group that took part in the match */
05954             for (idx=1; idx < regs->num_regs; idx++) {
05955                 if (BEG(idx) == -1) continue;
05956                 if (BEG(idx) == END(idx))
05957                     tmp = str_new_empty(str);
05958                 else
05959                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05960                 rb_ary_push(result, tmp);
05961             }
05962             if (!NIL_P(limit) && lim <= ++i) break;
05963         }
05964     }
          /*
           * Push whatever follows the last separator.  This happens when a
           * positive limit is in force, when unconsumed bytes remain, or when
           * the limit was negative (in which case the tail may be empty).
           */
05965     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05966         if (RSTRING_LEN(str) == beg)
05967             tmp = str_new_empty(str);
05968         else
05969             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05970         rb_ary_push(result, tmp);
05971     }
          /* With no limit (or a limit of 0) trailing empty fields are dropped. */
05972     if (NIL_P(limit) && lim == 0) {
05973         long len;
05974         while ((len = RARRAY_LEN(result)) > 0 &&
05975                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05976             rb_ary_pop(result);
05977     }
05978 
05979     return result;
05980 }
05981 
05982 VALUE
05983 rb_str_split(VALUE str, const char *sep0)
05984 {
05985     VALUE sep;
05986 
05987     StringValue(str);
05988     sep = rb_str_new2(sep0);
05989     return rb_str_split_m(1, &sep, str);
05990 }
05991 
05992 
05993 /*
05994  *  call-seq:
05995  *     str.each_line(separator=$/) {|substr| block }   -> str
05996  *     str.each_line(separator=$/)                     -> an_enumerator
05997  *
05998  *     str.lines(separator=$/) {|substr| block }       -> str
05999  *     str.lines(separator=$/)                         -> an_enumerator
06000  *
06001  *  Splits <i>str</i> using the supplied parameter as the record separator
06002  *  (<code>$/</code> by default), passing each substring in turn to the supplied
06003  *  block. If a zero-length record separator is supplied, the string is split
06004  *  into paragraphs delimited by multiple successive newlines.
06005  *
06006  *  If no block is given, an enumerator is returned instead.
06007  *
06008  *     print "Example one\n"
06009  *     "hello\nworld".each_line {|s| p s}
06010  *     print "Example two\n"
06011  *     "hello\nworld".each_line('l') {|s| p s}
06012  *     print "Example three\n"
06013  *     "hello\n\n\nworld".each_line('') {|s| p s}
06014  *
06015  *  <em>produces:</em>
06016  *
06017  *     Example one
06018  *     "hello\n"
06019  *     "world"
06020  *     Example two
06021  *     "hel"
06022  *     "l"
06023  *     "o\nworl"
06024  *     "d"
06025  *     Example three
06026  *     "hello\n\n\n"
06027  *     "world"
06028  */
06029 
06030 static VALUE
06031 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06032 {
          /*
           * Implementation of String#each_line / String#lines.
           *
           * s    - start of the line currently being collected
           * p    - scan position
           * pend - one past the last byte of the string
           * ptr/len - pointer and length captured before iterating; passed to
           *           str_mod_check() after each yield
           * orig - the receiver as passed in; this is the return value
           */
06033     rb_encoding *enc;
06034     VALUE rs;
06035     unsigned int newline;
06036     const char *p, *pend, *s, *ptr;
06037     long len, rslen;
06038     VALUE line;
06039     int n;
06040     VALUE orig = str;
06041 
          /* the separator defaults to rb_rs when no argument is given */
06042     if (argc == 0) {
06043         rs = rb_rs;
06044     }
06045     else {
06046         rb_scan_args(argc, argv, "01", &rs);
06047     }
06048     RETURN_ENUMERATOR(str, argc, argv);
          /* nil separator: the whole string is yielded once */
06049     if (NIL_P(rs)) {
06050         rb_yield(str);
06051         return orig;
06052     }
          /* from here on the iteration runs over the string returned by rb_str_new4() */
06053     str = rb_str_new4(str);
06054     ptr = p = s = RSTRING_PTR(str);
06055     pend = p + RSTRING_LEN(str);
06056     len = RSTRING_LEN(str);
06057     StringValue(rs);
          /*
           * Default separator: locate '\n' bytes with memchr(), then confirm
           * through the encoding that the byte really is a newline character
           * before cutting the line after it.
           */
06058     if (rs == rb_default_rs) {
06059         enc = rb_enc_get(str);
06060         while (p < pend) {
06061             char *p0;
06062 
06063             p = memchr(p, '\n', pend - p);
06064             if (!p) break;
06065             p0 = rb_enc_left_char_head(s, p, pend, enc);
06066             if (!rb_enc_is_newline(p0, pend, enc)) {
06067                 p++;
06068                 continue;
06069             }
06070             p = p0 + rb_enc_mbclen(p0, pend, enc);
06071             line = rb_str_new5(str, s, p - s);
06072             OBJ_INFECT(line, str);
06073             rb_enc_cr_str_copy_for_substr(line, str);
06074             rb_yield(line);
06075             str_mod_check(str, ptr, len);
06076             s = p;
06077         }
06078         goto finish;
06079     }
06080 
          /*
           * General separator.  `newline` is the codepoint that triggers a
           * full comparison: '\n' for an empty separator (paragraph mode),
           * otherwise the first codepoint of rs.
           */
06081     enc = rb_enc_check(str, rs);
06082     rslen = RSTRING_LEN(rs);
06083     if (rslen == 0) {
06084         newline = '\n';
06085     }
06086     else {
06087         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
06088     }
06089 
06090     while (p < pend) {
06091         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
06092 
06093       again:
              /*
               * Paragraph mode: after a newline, look at the next character.
               * If it is not a newline, restart the checks on that character
               * (a single newline does not end a paragraph).  Otherwise skip
               * the rest of the newline run, then step back one character so
               * that p points at the last newline of the run.
               */
06094         if (rslen == 0 && c == newline) {
06095             p += n;
06096             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
06097                 goto again;
06098             }
06099             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
06100                 p += n;
06101             }
06102             p -= n;
06103         }
              /*
               * Separator found at p: yield from s through the end of the
               * separator (rslen bytes, or n bytes in paragraph mode).
               */
06104         if (c == newline &&
06105             (rslen <= 1 ||
06106              (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
06107             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
06108             OBJ_INFECT(line, str);
06109             rb_enc_cr_str_copy_for_substr(line, str);
06110             rb_yield(line);
06111             str_mod_check(str, ptr, len);
06112             s = p + (rslen ? rslen : n);
06113         }
06114         p += n;
06115     }
06116 
06117   finish:
          /* yield whatever follows the last separator */
06118     if (s != pend) {
06119         line = rb_str_new5(str, s, pend - s);
06120         OBJ_INFECT(line, str);
06121         rb_enc_cr_str_copy_for_substr(line, str);
06122         rb_yield(line);
06123     }
06124 
06125     return orig;
06126 }
06127 
06128 
06129 /*
06130  *  call-seq:
06131  *     str.bytes {|fixnum| block }        -> str
06132  *     str.bytes                          -> an_enumerator
06133  *
06134  *     str.each_byte {|fixnum| block }    -> str
06135  *     str.each_byte                      -> an_enumerator
06136  *
06137  *  Passes each byte in <i>str</i> to the given block, or returns
06138  *  an enumerator if no block is given.
06139  *
06140  *     "hello".each_byte {|c| print c, ' ' }
06141  *
06142  *  <em>produces:</em>
06143  *
06144  *     104 101 108 108 111
06145  */
06146 
06147 static VALUE
06148 rb_str_each_byte(VALUE str)
06149 {
06150     long i;
06151 
06152     RETURN_ENUMERATOR(str, 0, 0);
06153     for (i=0; i<RSTRING_LEN(str); i++) {
06154         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06155     }
06156     return str;
06157 }
06158 
06159 
06160 /*
06161  *  call-seq:
06162  *     str.chars {|cstr| block }        -> str
06163  *     str.chars                        -> an_enumerator
06164  *
06165  *     str.each_char {|cstr| block }    -> str
06166  *     str.each_char                    -> an_enumerator
06167  *
06168  *  Passes each character in <i>str</i> to the given block, or returns
06169  *  an enumerator if no block is given.
06170  *
06171  *     "hello".each_char {|c| print c, ' ' }
06172  *
06173  *  <em>produces:</em>
06174  *
06175  *     h e l l o
06176  */
06177 
06178 static VALUE
06179 rb_str_each_char(VALUE str)
06180 {
06181     VALUE orig = str;
06182     long i, len, n;
06183     const char *ptr;
06184     rb_encoding *enc;
06185 
06186     RETURN_ENUMERATOR(str, 0, 0);
06187     str = rb_str_new4(str);
06188     ptr = RSTRING_PTR(str);
06189     len = RSTRING_LEN(str);
06190     enc = rb_enc_get(str);
06191     switch (ENC_CODERANGE(str)) {
06192       case ENC_CODERANGE_VALID:
06193       case ENC_CODERANGE_7BIT:
06194         for (i = 0; i < len; i += n) {
06195             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06196             rb_yield(rb_str_subseq(str, i, n));
06197         }
06198         break;
06199       default:
06200         for (i = 0; i < len; i += n) {
06201             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06202             rb_yield(rb_str_subseq(str, i, n));
06203         }
06204     }
06205     return orig;
06206 }
06207 
06208 /*
06209  *  call-seq:
06210  *     str.codepoints {|integer| block }        -> str
06211  *     str.codepoints                           -> an_enumerator
06212  *
06213  *     str.each_codepoint {|integer| block }    -> str
06214  *     str.each_codepoint                       -> an_enumerator
06215  *
06216  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>
06217  *  (also known as a <i>codepoint</i> when applied to Unicode strings) to the
06218  *  given block.
06219  *
06220  *  If no block is given, an enumerator is returned instead.
06221  *
06222  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
06223  *
06224  *  <em>produces:</em>
06225  *
06226  *     104 101 108 108 111 1593
06227  */
06228 
06229 static VALUE
06230 rb_str_each_codepoint(VALUE str)
06231 {
06232     VALUE orig = str;
06233     int n;
06234     unsigned int c;
06235     const char *ptr, *end;
06236     rb_encoding *enc;
06237 
06238     if (single_byte_optimizable(str)) return rb_str_each_byte(str);
06239     RETURN_ENUMERATOR(str, 0, 0);
06240     str = rb_str_new4(str);
06241     ptr = RSTRING_PTR(str);
06242     end = RSTRING_END(str);
06243     enc = STR_ENC_GET(str);
06244     while (ptr < end) {
06245         c = rb_enc_codepoint_len(ptr, end, &n, enc);
06246         rb_yield(UINT2NUM(c));
06247         ptr += n;
06248     }
06249     return orig;
06250 }
06251 
06252 static long
06253 chopped_length(VALUE str)
06254 {
06255     rb_encoding *enc = STR_ENC_GET(str);
06256     const char *p, *p2, *beg, *end;
06257 
06258     beg = RSTRING_PTR(str);
06259     end = beg + RSTRING_LEN(str);
06260     if (beg > end) return 0;
06261     p = rb_enc_prev_char(beg, end, end, enc);
06262     if (!p) return 0;
06263     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06264         p2 = rb_enc_prev_char(beg, p, end, enc);
06265         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06266     }
06267     return p - beg;
06268 }
06269 
06270 /*
06271  *  call-seq:
06272  *     str.chop!   -> str or nil
06273  *
06274  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
06275  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
06276  *  <code>String#chomp!</code>.
06277  */
06278 
06279 static VALUE
06280 rb_str_chop_bang(VALUE str)
06281 {
06282     str_modify_keep_cr(str);
06283     if (RSTRING_LEN(str) > 0) {
06284         long len;
06285         len = chopped_length(str);
06286         STR_SET_LEN(str, len);
06287         RSTRING_PTR(str)[len] = '\0';
06288         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06289             ENC_CODERANGE_CLEAR(str);
06290         }
06291         return str;
06292     }
06293     return Qnil;
06294 }
06295 
06296 
06297 /*
06298  *  call-seq:
06299  *     str.chop   -> new_str
06300  *
06301  *  Returns a new <code>String</code> with the last character removed.  If the
06302  *  string ends with <code>\r\n</code>, both characters are removed. Applying
06303  *  <code>chop</code> to an empty string returns an empty
06304  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
06305  *  the string unchanged if it doesn't end in a record separator.
06306  *
06307  *     "string\r\n".chop   #=> "string"
06308  *     "string\n\r".chop   #=> "string\n"
06309  *     "string\n".chop     #=> "string"
06310  *     "string".chop       #=> "strin"
06311  *     "x".chop.chop       #=> ""
06312  */
06313 
06314 static VALUE
06315 rb_str_chop(VALUE str)
06316 {
06317     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06318     rb_enc_cr_str_copy_for_substr(str2, str);
06319     OBJ_INFECT(str2, str);
06320     return str2;
06321 }
06322 
06323 
06324 /*
06325  *  call-seq:
06326  *     str.chomp!(separator=$/)   -> str or nil
06327  *
06328  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
06329  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
06330  */
06331 
06332 static VALUE
06333 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06334 {
06335     rb_encoding *enc;
06336     VALUE rs;
06337     int newline;
06338     char *p, *pp, *e;
06339     long len, rslen;
06340 
06341     str_modify_keep_cr(str);
06342     len = RSTRING_LEN(str);
06343     if (len == 0) return Qnil;
06344     p = RSTRING_PTR(str);
06345     e = p + len;
06346     if (argc == 0) {
06347         rs = rb_rs;
06348         if (rs == rb_default_rs) {
06349           smart_chomp:
06350             enc = rb_enc_get(str);
06351             if (rb_enc_mbminlen(enc) > 1) {
06352                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06353                 if (rb_enc_is_newline(pp, e, enc)) {
06354                     e = pp;
06355                 }
06356                 pp = e - rb_enc_mbminlen(enc);
06357                 if (pp >= p) {
06358                     pp = rb_enc_left_char_head(p, pp, e, enc);
06359                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06360                         e = pp;
06361                     }
06362                 }
06363                 if (e == RSTRING_END(str)) {
06364                     return Qnil;
06365                 }
06366                 len = e - RSTRING_PTR(str);
06367                 STR_SET_LEN(str, len);
06368             }
06369             else {
06370                 if (RSTRING_PTR(str)[len-1] == '\n') {
06371                     STR_DEC_LEN(str);
06372                     if (RSTRING_LEN(str) > 0 &&
06373                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06374                         STR_DEC_LEN(str);
06375                     }
06376                 }
06377                 else if (RSTRING_PTR(str)[len-1] == '\r') {
06378                     STR_DEC_LEN(str);
06379                 }
06380                 else {
06381                     return Qnil;
06382                 }
06383             }
06384             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06385             return str;
06386         }
06387     }
06388     else {
06389         rb_scan_args(argc, argv, "01", &rs);
06390     }
06391     if (NIL_P(rs)) return Qnil;
06392     StringValue(rs);
06393     rslen = RSTRING_LEN(rs);
06394     if (rslen == 0) {
06395         while (len>0 && p[len-1] == '\n') {
06396             len--;
06397             if (len>0 && p[len-1] == '\r')
06398                 len--;
06399         }
06400         if (len < RSTRING_LEN(str)) {
06401             STR_SET_LEN(str, len);
06402             RSTRING_PTR(str)[len] = '\0';
06403             return str;
06404         }
06405         return Qnil;
06406     }
06407     if (rslen > len) return Qnil;
06408     newline = RSTRING_PTR(rs)[rslen-1];
06409     if (rslen == 1 && newline == '\n')
06410         goto smart_chomp;
06411 
06412     enc = rb_enc_check(str, rs);
06413     if (is_broken_string(rs)) {
06414         return Qnil;
06415     }
06416     pp = e - rslen;
06417     if (p[len-1] == newline &&
06418         (rslen <= 1 ||
06419          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06420         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06421             return Qnil;
06422         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06423             ENC_CODERANGE_CLEAR(str);
06424         }
06425         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06426         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06427         return str;
06428     }
06429     return Qnil;
06430 }
06431 
06432 
06433 /*
06434  *  call-seq:
06435  *     str.chomp(separator=$/)   -> new_str
06436  *
06437  *  Returns a new <code>String</code> with the given record separator removed
06438  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
06439  *  changed from the default Ruby record separator, then <code>chomp</code> also
06440  *  removes carriage return characters (that is it will remove <code>\n</code>,
06441  *  <code>\r</code>, and <code>\r\n</code>).
06442  *
06443  *     "hello".chomp            #=> "hello"
06444  *     "hello\n".chomp          #=> "hello"
06445  *     "hello\r\n".chomp        #=> "hello"
06446  *     "hello\n\r".chomp        #=> "hello\n"
06447  *     "hello\r".chomp          #=> "hello"
06448  *     "hello \n there".chomp   #=> "hello \n there"
06449  *     "hello".chomp("llo")     #=> "he"
06450  */
06451 
06452 static VALUE
06453 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06454 {
06455     str = rb_str_dup(str);
06456     rb_str_chomp_bang(argc, argv, str);
06457     return str;
06458 }
06459 
06460 /*
06461  *  call-seq:
06462  *     str.lstrip!   -> self or nil
06463  *
06464  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
06465  *  change was made. See also <code>String#rstrip!</code> and
06466  *  <code>String#strip!</code>.
06467  *
06468  *     "  hello  ".lstrip!  #=> "hello  "
06469  *     "hello".lstrip!      #=> nil
06470  */
06471 
06472 static VALUE
06473 rb_str_lstrip_bang(VALUE str)
06474 {
06475     rb_encoding *enc;
06476     char *s, *t, *e;
06477 
06478     str_modify_keep_cr(str);
06479     enc = STR_ENC_GET(str);
06480     s = RSTRING_PTR(str);
06481     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06482     e = t = RSTRING_END(str);
06483     /* remove spaces at head */
06484     while (s < e) {
06485         int n;
06486         unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06487 
06488         if (!rb_isspace(cc)) break;
06489         s += n;
06490     }
06491 
06492     if (s > RSTRING_PTR(str)) {
06493         STR_SET_LEN(str, t-s);
06494         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06495         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06496         return str;
06497     }
06498     return Qnil;
06499 }
06500 
06501 
06502 /*
06503  *  call-seq:
06504  *     str.lstrip   -> new_str
06505  *
06506  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
06507  *  <code>String#rstrip</code> and <code>String#strip</code>.
06508  *
06509  *     "  hello  ".lstrip   #=> "hello  "
06510  *     "hello".lstrip       #=> "hello"
06511  */
06512 
06513 static VALUE
06514 rb_str_lstrip(VALUE str)
06515 {
06516     str = rb_str_dup(str);
06517     rb_str_lstrip_bang(str);
06518     return str;
06519 }
06520 
06521 
06522 /*
06523  *  call-seq:
06524  *     str.rstrip!   -> self or nil
06525  *
06526  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
06527  *  no change was made. See also <code>String#lstrip!</code> and
06528  *  <code>String#strip!</code>.
06529  *
06530  *     "  hello  ".rstrip!  #=> "  hello"
06531  *     "hello".rstrip!      #=> nil
06532  */
06533 
06534 static VALUE
06535 rb_str_rstrip_bang(VALUE str)
06536 {
06537     rb_encoding *enc;
06538     char *s, *t, *e;
06539 
06540     str_modify_keep_cr(str);
06541     enc = STR_ENC_GET(str);
06542     rb_str_check_dummy_enc(enc);
06543     s = RSTRING_PTR(str);
06544     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06545     t = e = RSTRING_END(str);
06546 
06547     /* remove trailing spaces or '\0's */
06548     if (single_byte_optimizable(str)) {
06549         unsigned char c;
06550         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06551     }
06552     else {
06553         char *tp;
06554 
06555         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06556             unsigned int c = rb_enc_codepoint(tp, e, enc);
06557             if (c && !rb_isspace(c)) break;
06558             t = tp;
06559         }
06560     }
06561     if (t < e) {
06562         long len = t-RSTRING_PTR(str);
06563 
06564         STR_SET_LEN(str, len);
06565         RSTRING_PTR(str)[len] = '\0';
06566         return str;
06567     }
06568     return Qnil;
06569 }
06570 
06571 
06572 /*
06573  *  call-seq:
06574  *     str.rstrip   -> new_str
06575  *
06576  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
06577  *  <code>String#lstrip</code> and <code>String#strip</code>.
06578  *
06579  *     "  hello  ".rstrip   #=> "  hello"
06580  *     "hello".rstrip       #=> "hello"
06581  */
06582 
06583 static VALUE
06584 rb_str_rstrip(VALUE str)
06585 {
06586     str = rb_str_dup(str);
06587     rb_str_rstrip_bang(str);
06588     return str;
06589 }
06590 
06591 
06592 /*
06593  *  call-seq:
06594  *     str.strip!   -> str or nil
06595  *
06596  *  Removes leading and trailing whitespace from <i>str</i>. Returns
06597  *  <code>nil</code> if <i>str</i> was not altered.
06598  */
06599 
06600 static VALUE
06601 rb_str_strip_bang(VALUE str)
06602 {
06603     VALUE l = rb_str_lstrip_bang(str);
06604     VALUE r = rb_str_rstrip_bang(str);
06605 
06606     if (NIL_P(l) && NIL_P(r)) return Qnil;
06607     return str;
06608 }
06609 
06610 
06611 /*
06612  *  call-seq:
06613  *     str.strip   -> new_str
06614  *
06615  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
06616  *
06617  *     "    hello    ".strip   #=> "hello"
06618  *     "\tgoodbye\r\n".strip   #=> "goodbye"
06619  */
06620 
06621 static VALUE
06622 rb_str_strip(VALUE str)
06623 {
06624     str = rb_str_dup(str);
06625     rb_str_strip_bang(str);
06626     return str;
06627 }
06628 
06629 static VALUE
06630 scan_once(VALUE str, VALUE pat, long *start)
06631 {
06632     VALUE result, match;
06633     struct re_registers *regs;
06634     int i;
06635 
06636     if (rb_reg_search(pat, str, *start, 0) >= 0) {
06637         match = rb_backref_get();
06638         regs = RMATCH_REGS(match);
06639         if (BEG(0) == END(0)) {
06640             rb_encoding *enc = STR_ENC_GET(str);
06641             /*
06642              * Always consume at least one character of the input string
06643              */
06644             if (RSTRING_LEN(str) > END(0))
06645                 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06646                                                    RSTRING_END(str), enc);
06647             else
06648                 *start = END(0)+1;
06649         }
06650         else {
06651             *start = END(0);
06652         }
06653         if (regs->num_regs == 1) {
06654             return rb_reg_nth_match(0, match);
06655         }
06656         result = rb_ary_new2(regs->num_regs);
06657         for (i=1; i < regs->num_regs; i++) {
06658             rb_ary_push(result, rb_reg_nth_match(i, match));
06659         }
06660 
06661         return result;
06662     }
06663     return Qnil;
06664 }
06665 
06666 
06667 /*
06668  *  call-seq:
06669  *     str.scan(pattern)                         -> array
06670  *     str.scan(pattern) {|match, ...| block }   -> str
06671  *
06672  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
06673  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
06674  *  generated and either added to the result array or passed to the block. If
06675  *  the pattern contains no groups, each individual result consists of the
06676  *  matched string, <code>$&</code>.  If the pattern contains groups, each
06677  *  individual result is itself an array containing one entry per group.
06678  *
06679  *     a = "cruel world"
06680  *     a.scan(/\w+/)        #=> ["cruel", "world"]
06681  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
06682  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
06683  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
06684  *
06685  *  And the block form:
06686  *
06687  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
06688  *     print "\n"
06689  *     a.scan(/(.)(.)/) {|x,y| print y, x }
06690  *     print "\n"
06691  *
06692  *  <em>produces:</em>
06693  *
06694  *     <<cruel>> <<world>>
06695  *     rceu lowlr
06696  */
06697 
06698 static VALUE
06699 rb_str_scan(VALUE str, VALUE pat)
06700 {
06701     VALUE result;
06702     long start = 0;
06703     long last = -1, prev = 0;
06704     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06705 
06706     pat = get_pat(pat, 1);
06707     if (!rb_block_given_p()) {
06708         VALUE ary = rb_ary_new();
06709 
06710         while (!NIL_P(result = scan_once(str, pat, &start))) {
06711             last = prev;
06712             prev = start;
06713             rb_ary_push(ary, result);
06714         }
06715         if (last >= 0) rb_reg_search(pat, str, last, 0);
06716         return ary;
06717     }
06718 
06719     while (!NIL_P(result = scan_once(str, pat, &start))) {
06720         last = prev;
06721         prev = start;
06722         rb_yield(result);
06723         str_mod_check(str, p, len);
06724     }
06725     if (last >= 0) rb_reg_search(pat, str, last, 0);
06726     return str;
06727 }
06728 
06729 
06730 /*
06731  *  call-seq:
06732  *     str.hex   -> integer
06733  *
06734  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
06735  *  (with an optional sign and an optional <code>0x</code>) and returns the
06736  *  corresponding number. Zero is returned on error.
06737  *
06738  *     "0x0a".hex     #=> 10
06739  *     "-1234".hex    #=> -4660
06740  *     "0".hex        #=> 0
06741  *     "wombat".hex   #=> 0
06742  */
06743 
06744 static VALUE
06745 rb_str_hex(VALUE str)
06746 {
06747     rb_encoding *enc = rb_enc_get(str);
06748 
06749     if (!rb_enc_asciicompat(enc)) {
06750         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06751     }
06752     return rb_str_to_inum(str, 16, FALSE);
06753 }
06754 
06755 
06756 /*
06757  *  call-seq:
06758  *     str.oct   -> integer
06759  *
06760  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
06761  *  optional sign) and returns the corresponding number.  Returns 0 if the
06762  *  conversion fails.
06763  *
06764  *     "123".oct       #=> 83
06765  *     "-377".oct      #=> -255
06766  *     "bad".oct       #=> 0
06767  *     "0377bad".oct   #=> 255
06768  */
06769 
06770 static VALUE
06771 rb_str_oct(VALUE str)
06772 {
06773     rb_encoding *enc = rb_enc_get(str);
06774 
06775     if (!rb_enc_asciicompat(enc)) {
06776         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06777     }
06778     return rb_str_to_inum(str, -8, FALSE);
06779 }
06780 
06781 
06782 /*
06783  *  call-seq:
06784  *     str.crypt(other_str)   -> new_str
06785  *
06786  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
06787  *  library function <code>crypt</code>. The argument is the salt string, which
06788  *  should be two characters long, each character drawn from
06789  *  <code>[a-zA-Z0-9./]</code>.
06790  */
06791 
06792 static VALUE
06793 rb_str_crypt(VALUE str, VALUE salt)
06794 {
06795     extern char *crypt(const char *, const char *);
06796     VALUE result;
06797     const char *s, *saltp;
06798     char *res;
06799 #ifdef BROKEN_CRYPT
06800     char salt_8bit_clean[3];
06801 #endif
06802 
06803     StringValue(salt);
06804     if (RSTRING_LEN(salt) < 2)
06805         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06806 
06807     s = RSTRING_PTR(str);
06808     if (!s) s = "";
06809     saltp = RSTRING_PTR(salt);
06810 #ifdef BROKEN_CRYPT
06811     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06812         salt_8bit_clean[0] = saltp[0] & 0x7f;
06813         salt_8bit_clean[1] = saltp[1] & 0x7f;
06814         salt_8bit_clean[2] = '\0';
06815         saltp = salt_8bit_clean;
06816     }
06817 #endif
06818     res = crypt(s, saltp);
06819     if (!res) {
06820         rb_sys_fail("crypt");
06821     }
06822     result = rb_str_new2(res);
06823     OBJ_INFECT(result, str);
06824     OBJ_INFECT(result, salt);
06825     return result;
06826 }
06827 
06828 
06829 /*
06830  *  call-seq:
06831  *     str.intern   -> symbol
06832  *     str.to_sym   -> symbol
06833  *
06834  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
06835  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
06836  *
06837  *     "Koala".intern         #=> :Koala
06838  *     s = 'cat'.to_sym       #=> :cat
06839  *     s == :cat              #=> true
06840  *     s = '@cat'.to_sym      #=> :@cat
06841  *     s == :@cat             #=> true
06842  *
06843  *  This can also be used to create symbols that cannot be represented using the
06844  *  <code>:xxx</code> notation.
06845  *
06846  *     'cat and dog'.to_sym   #=> :"cat and dog"
06847  */
06848 
06849 VALUE
06850 rb_str_intern(VALUE s)
06851 {
06852     VALUE str = RB_GC_GUARD(s);
06853     ID id;
06854 
06855     id = rb_intern_str(str);
06856     return ID2SYM(id);
06857 }
06858 
06859 
06860 /*
06861  *  call-seq:
06862  *     str.ord   -> integer
06863  *
06864  *  Returns the <code>Integer</code> ordinal of a one-character string.
06865  *
06866  *     "a".ord         #=> 97
06867  */
06868 
06869 VALUE
06870 rb_str_ord(VALUE s)
06871 {
06872     unsigned int c;
06873 
06874     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06875     return UINT2NUM(c);
06876 }
06877 /*
06878  *  call-seq:
06879  *     str.sum(n=16)   -> integer
06880  *
06881  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
06882  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
06883  *  to 16. The result is simply the sum of the binary value of each byte in
06884  *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
06885  *  checksum.
06886  */
06887 
06888 static VALUE
06889 rb_str_sum(int argc, VALUE *argv, VALUE str)
06890 {
06891     VALUE vbits;
06892     int bits;
06893     char *ptr, *p, *pend;
06894     long len;
06895     VALUE sum = INT2FIX(0);
06896     unsigned long sum0 = 0;
06897 
06898     if (argc == 0) {
06899         bits = 16;
06900     }
06901     else {
06902         rb_scan_args(argc, argv, "01", &vbits);
06903         bits = NUM2INT(vbits);
06904     }
06905     ptr = p = RSTRING_PTR(str);
06906     len = RSTRING_LEN(str);
06907     pend = p + len;
06908 
06909     while (p < pend) {
06910         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06911             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06912             str_mod_check(str, ptr, len);
06913             sum0 = 0;
06914         }
06915         sum0 += (unsigned char)*p;
06916         p++;
06917     }
06918 
06919     if (bits == 0) {
06920         if (sum0) {
06921             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06922         }
06923     }
06924     else {
06925         if (sum == INT2FIX(0)) {
06926             if (bits < (int)sizeof(long)*CHAR_BIT) {
06927                 sum0 &= (((unsigned long)1)<<bits)-1;
06928             }
06929             sum = LONG2FIX(sum0);
06930         }
06931         else {
06932             VALUE mod;
06933 
06934             if (sum0) {
06935                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06936             }
06937 
06938             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06939             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06940             sum = rb_funcall(sum, '&', 1, mod);
06941         }
06942     }
06943     return sum;
06944 }
06945 
06946 static VALUE
06947 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06948 {
06949     rb_encoding *enc;
06950     VALUE w;
06951     long width, len, flen = 1, fclen = 1;
06952     VALUE res;
06953     char *p;
06954     const char *f = " ";
06955     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06956     volatile VALUE pad;
06957     int singlebyte = 1, cr;
06958 
06959     rb_scan_args(argc, argv, "11", &w, &pad);
06960     enc = STR_ENC_GET(str);
06961     width = NUM2LONG(w);
06962     if (argc == 2) {
06963         StringValue(pad);
06964         enc = rb_enc_check(str, pad);
06965         f = RSTRING_PTR(pad);
06966         flen = RSTRING_LEN(pad);
06967         fclen = str_strlen(pad, enc);
06968         singlebyte = single_byte_optimizable(pad);
06969         if (flen == 0 || fclen == 0) {
06970             rb_raise(rb_eArgError, "zero width padding");
06971         }
06972     }
06973     len = str_strlen(str, enc);
06974     if (width < 0 || len >= width) return rb_str_dup(str);
06975     n = width - len;
06976     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06977     rlen = n - llen;
06978     cr = ENC_CODERANGE(str);
06979     if (flen > 1) {
06980        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06981        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06982     }
06983     size = RSTRING_LEN(str);
06984     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06985        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06986        (len += llen2 + rlen2) >= LONG_MAX - size) {
06987        rb_raise(rb_eArgError, "argument too big");
06988     }
06989     len += size;
06990     res = rb_str_new5(str, 0, len);
06991     p = RSTRING_PTR(res);
06992     if (flen <= 1) {
06993        memset(p, *f, llen);
06994        p += llen;
06995     }
06996     else {
06997        while (llen >= fclen) {
06998             memcpy(p,f,flen);
06999             p += flen;
07000             llen -= fclen;
07001         }
07002        if (llen > 0) {
07003            memcpy(p, f, llen2);
07004            p += llen2;
07005         }
07006     }
07007     memcpy(p, RSTRING_PTR(str), size);
07008     p += size;
07009     if (flen <= 1) {
07010        memset(p, *f, rlen);
07011        p += rlen;
07012     }
07013     else {
07014        while (rlen >= fclen) {
07015             memcpy(p,f,flen);
07016             p += flen;
07017             rlen -= fclen;
07018         }
07019        if (rlen > 0) {
07020            memcpy(p, f, rlen2);
07021            p += rlen2;
07022         }
07023     }
07024     *p = '\0';
07025     STR_SET_LEN(res, p-RSTRING_PTR(res));
07026     OBJ_INFECT(res, str);
07027     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
07028     rb_enc_associate(res, enc);
07029     if (argc == 2)
07030         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
07031     if (cr != ENC_CODERANGE_BROKEN)
07032         ENC_CODERANGE_SET(res, cr);
07033     return res;
07034 }
07035 
07036 
07037 /*
07038  *  call-seq:
07039  *     str.ljust(integer, padstr=' ')   -> new_str
07040  *
07041  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07042  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
07043  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07044  *
07045  *     "hello".ljust(4)            #=> "hello"
07046  *     "hello".ljust(20)           #=> "hello               "
07047  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
07048  */
07049 
07050 static VALUE
07051 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07052 {
07053     return rb_str_justify(argc, argv, str, 'l');
07054 }
07055 
07056 
07057 /*
07058  *  call-seq:
07059  *     str.rjust(integer, padstr=' ')   -> new_str
07060  *
07061  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07062  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
07063  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07064  *
07065  *     "hello".rjust(4)            #=> "hello"
07066  *     "hello".rjust(20)           #=> "               hello"
07067  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
07068  */
07069 
07070 static VALUE
07071 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07072 {
07073     return rb_str_justify(argc, argv, str, 'r');
07074 }
07075 
07076 
07077 /*
07078  *  call-seq:
07079  *     str.center(integer, padstr=' ')   -> new_str
07080  *
07081  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07082  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
07083  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07084  *
07085  *     "hello".center(4)         #=> "hello"
07086  *     "hello".center(20)        #=> "       hello        "
07087  *     "hello".center(20, '123') #=> "1231231hello12312312"
07088  */
07089 
07090 static VALUE
07091 rb_str_center(int argc, VALUE *argv, VALUE str)
07092 {
07093     return rb_str_justify(argc, argv, str, 'c');
07094 }
07095 
07096 /*
07097  *  call-seq:
07098  *     str.partition(sep)              -> [head, sep, tail]
07099  *     str.partition(regexp)           -> [head, match, tail]
07100  *
07101  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
07102  *  and returns the part before it, the match, and the part
07103  *  after it.
07104  *  If it is not found, returns <i>str</i> and two empty strings.
07105  *
07106  *     "hello".partition("l")         #=> ["he", "l", "lo"]
07107  *     "hello".partition("x")         #=> ["hello", "", ""]
07108  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
07109  */
07110 
07111 static VALUE
07112 rb_str_partition(VALUE str, VALUE sep)
07113 {
07114     long pos;
07115     int regex = FALSE;
07116 
07117     if (TYPE(sep) == T_REGEXP) {
07118         pos = rb_reg_search(sep, str, 0, 0);
07119         regex = TRUE;
07120     }
07121     else {
07122         VALUE tmp;
07123 
07124         tmp = rb_check_string_type(sep);
07125         if (NIL_P(tmp)) {
07126             rb_raise(rb_eTypeError, "type mismatch: %s given",
07127                      rb_obj_classname(sep));
07128         }
07129         sep = tmp;
07130         pos = rb_str_index(str, sep, 0);
07131     }
07132     if (pos < 0) {
07133       failed:
07134         return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07135     }
07136     if (regex) {
07137         sep = rb_str_subpat(str, sep, INT2FIX(0));
07138         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07139     }
07140     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07141                           sep,
07142                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
07143                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07144 }
07145 
07146 /*
07147  *  call-seq:
07148  *     str.rpartition(sep)             -> [head, sep, tail]
07149  *     str.rpartition(regexp)          -> [head, match, tail]
07150  *
07151  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
07152  *  of the string, and returns the part before it, the match, and the part
07153  *  after it.
07154  *  If it is not found, returns two empty strings and <i>str</i>.
07155  *
07156  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
07157  *     "hello".rpartition("x")         #=> ["", "", "hello"]
07158  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
07159  */
07160 
07161 static VALUE
07162 rb_str_rpartition(VALUE str, VALUE sep)
07163 {
07164     long pos = RSTRING_LEN(str);
07165     int regex = FALSE;
07166 
07167     if (TYPE(sep) == T_REGEXP) {
07168         pos = rb_reg_search(sep, str, pos, 1);
07169         regex = TRUE;
07170     }
07171     else {
07172         VALUE tmp;
07173 
07174         tmp = rb_check_string_type(sep);
07175         if (NIL_P(tmp)) {
07176             rb_raise(rb_eTypeError, "type mismatch: %s given",
07177                      rb_obj_classname(sep));
07178         }
07179         sep = tmp;
07180         pos = rb_str_sublen(str, pos);
07181         pos = rb_str_rindex(str, sep, pos);
07182     }
07183     if (pos < 0) {
07184         return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07185     }
07186     if (regex) {
07187         sep = rb_reg_nth_match(0, rb_backref_get());
07188     }
07189     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07190                           sep,
07191                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07192 }
07193 
07194 /*
07195  *  call-seq:
07196  *     str.start_with?([prefix]+)   -> true or false
07197  *
07198  *  Returns true if <i>str</i> starts with one of the prefixes given.
07199  *
07200  *    p "hello".start_with?("hell")               #=> true
07201  *
07202  *    # returns true if one of the prefixes matches.
07203  *    p "hello".start_with?("heaven", "hell")     #=> true
07204  *    p "hello".start_with?("heaven", "paradise") #=> false
07205  *
07206  *
07207  *
07208  */
07209 
07210 static VALUE
07211 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07212 {
07213     int i;
07214 
07215     for (i=0; i<argc; i++) {
07216         VALUE tmp = rb_check_string_type(argv[i]);
07217         if (NIL_P(tmp)) continue;
07218         rb_enc_check(str, tmp);
07219         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07220         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07221             return Qtrue;
07222     }
07223     return Qfalse;
07224 }
07225 
07226 /*
07227  *  call-seq:
07228  *     str.end_with?([suffix]+)   -> true or false
07229  *
07230  *  Returns true if <i>str</i> ends with one of the suffixes given.
07231  */
07232 
07233 static VALUE
07234 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07235 {
07236     int i;
07237     char *p, *s, *e;
07238     rb_encoding *enc;
07239 
07240     for (i=0; i<argc; i++) {
07241         VALUE tmp = rb_check_string_type(argv[i]);
07242         if (NIL_P(tmp)) continue;
07243         enc = rb_enc_check(str, tmp);
07244         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07245         p = RSTRING_PTR(str);
07246         e = p + RSTRING_LEN(str);
07247         s = e - RSTRING_LEN(tmp);
07248         if (rb_enc_left_char_head(p, s, e, enc) != s)
07249             continue;
07250         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07251             return Qtrue;
07252     }
07253     return Qfalse;
07254 }
07255 
07256 void
07257 rb_str_setter(VALUE val, ID id, VALUE *var)
07258 {
07259     if (!NIL_P(val) && TYPE(val) != T_STRING) {
07260         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07261     }
07262     *var = val;
07263 }
07264 
07265 
07266 /*
07267  *  call-seq:
07268  *     str.force_encoding(encoding)   -> str
07269  *
07270  *  Changes the encoding to +encoding+ and returns self.
07271  */
07272 
07273 static VALUE
07274 rb_str_force_encoding(VALUE str, VALUE enc)
07275 {
07276     str_modifiable(str);
07277     rb_enc_associate(str, rb_to_encoding(enc));
07278     ENC_CODERANGE_CLEAR(str);
07279     return str;
07280 }
07281 
07282 /*
07283  *  call-seq:
07284  *     str.valid_encoding?  -> true or false
07285  *
07286  *  Returns true for a string which is encoded correctly.
07287  *
07288  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
07289  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
07290  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
07291  */
07292 
07293 static VALUE
07294 rb_str_valid_encoding_p(VALUE str)
07295 {
07296     int cr = rb_enc_str_coderange(str);
07297 
07298     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07299 }
07300 
07301 /*
07302  *  call-seq:
07303  *     str.ascii_only?  -> true or false
07304  *
07305  *  Returns true for a string which has only ASCII characters.
07306  *
07307  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
07308  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
07309  */
07310 
07311 static VALUE
07312 rb_str_is_ascii_only_p(VALUE str)
07313 {
07314     int cr = rb_enc_str_coderange(str);
07315 
07316     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07317 }
07318 
07333 VALUE
07334 rb_str_ellipsize(VALUE str, long len)
07335 {
07336     static const char ellipsis[] = "...";
07337     const long ellipsislen = sizeof(ellipsis) - 1;
07338     rb_encoding *const enc = rb_enc_get(str);
07339     const long blen = RSTRING_LEN(str);
07340     const char *const p = RSTRING_PTR(str), *e = p + blen;
07341     VALUE estr, ret = 0;
07342 
07343     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
07344     if (len * rb_enc_mbminlen(enc) >= blen ||
07345         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07346         ret = str;
07347     }
07348     else if (len <= ellipsislen ||
07349              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07350         if (rb_enc_asciicompat(enc)) {
07351             ret = rb_str_new_with_class(str, ellipsis, len);
07352             rb_enc_associate(ret, enc);
07353         }
07354         else {
07355             estr = rb_usascii_str_new(ellipsis, len);
07356             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07357         }
07358     }
07359     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07360         rb_str_cat(ret, ellipsis, ellipsislen);
07361     }
07362     else {
07363         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07364                              rb_enc_from_encoding(enc), 0, Qnil);
07365         rb_str_append(ret, estr);
07366     }
07367     return ret;
07368 }
07369 
07370 /**********************************************************************
07371  * Document-class: Symbol
07372  *
07373  *  <code>Symbol</code> objects represent names and some strings
07374  *  inside the Ruby
07375  *  interpreter. They are generated using the <code>:name</code> and
07376  *  <code>:"string"</code> literals
07377  *  syntax, and by the various <code>to_sym</code> methods. The same
07378  *  <code>Symbol</code> object will be created for a given name or string
07379  *  for the duration of a program's execution, regardless of the context
07380  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
07381  *  one context, a method in another, and a class in a third, the
07382  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
07383  *  all three contexts.
07384  *
07385  *     module One
07386  *       class Fred
07387  *       end
07388  *       $f1 = :Fred
07389  *     end
07390  *     module Two
07391  *       Fred = 1
07392  *       $f2 = :Fred
07393  *     end
07394  *     def Fred()
07395  *     end
07396  *     $f3 = :Fred
07397  *     $f1.object_id   #=> 2514190
07398  *     $f2.object_id   #=> 2514190
07399  *     $f3.object_id   #=> 2514190
07400  *
07401  */
07402 
07403 
07404 /*
07405  *  call-seq:
07406  *     sym == obj   -> true or false
07407  *
07408  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
07409  *  symbol, returns <code>true</code>.
07410  */
07411 
07412 static VALUE
07413 sym_equal(VALUE sym1, VALUE sym2)
07414 {
07415     if (sym1 == sym2) return Qtrue;
07416     return Qfalse;
07417 }
07418 
07419 
07420 static int
07421 sym_printable(const char *s, const char *send, rb_encoding *enc)
07422 {
07423     while (s < send) {
07424         int n;
07425         int c = rb_enc_codepoint_len(s, send, &n, enc);
07426 
07427         if (!rb_enc_isprint(c, enc)) return FALSE;
07428         s += n;
07429     }
07430     return TRUE;
07431 }
07432 
07433 /*
07434  *  call-seq:
07435  *     sym.inspect    -> string
07436  *
07437  *  Returns the representation of <i>sym</i> as a symbol literal.
07438  *
07439  *     :fred.inspect   #=> ":fred"
07440  */
07441 
07442 static VALUE
07443 sym_inspect(VALUE sym)
07444 {
07445     VALUE str;
07446     ID id = SYM2ID(sym);
07447     rb_encoding *enc;
07448     const char *ptr;
07449     long len;
07450     char *dest;
07451     rb_encoding *resenc = rb_default_internal_encoding();
07452 
07453     if (resenc == NULL) resenc = rb_default_external_encoding();
07454     sym = rb_id2str(id);
07455     enc = STR_ENC_GET(sym);
07456     ptr = RSTRING_PTR(sym);
07457     len = RSTRING_LEN(sym);
07458     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07459         !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07460         str = rb_str_inspect(sym);
07461         len = RSTRING_LEN(str);
07462         rb_str_resize(str, len + 1);
07463         dest = RSTRING_PTR(str);
07464         memmove(dest + 1, dest, len);
07465         dest[0] = ':';
07466     }
07467     else {
07468         char *dest;
07469         str = rb_enc_str_new(0, len + 1, enc);
07470         dest = RSTRING_PTR(str);
07471         dest[0] = ':';
07472         memcpy(dest + 1, ptr, len);
07473     }
07474     return str;
07475 }
07476 
07477 
07478 /*
07479  *  call-seq:
07480  *     sym.id2name   -> string
07481  *     sym.to_s      -> string
07482  *
07483  *  Returns the name or string corresponding to <i>sym</i>.
07484  *
07485  *     :fred.id2name   #=> "fred"
07486  */
07487 
07488 
07489 VALUE
07490 rb_sym_to_s(VALUE sym)
07491 {
07492     ID id = SYM2ID(sym);
07493 
07494     return str_new3(rb_cString, rb_id2str(id));
07495 }
07496 
07497 
07498 /*
07499  * call-seq:
07500  *   sym.to_sym   -> sym
07501  *   sym.intern   -> sym
07502  *
07503  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
07504  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
07505  * in this case.
07506  */
07507 
07508 static VALUE
07509 sym_to_sym(VALUE sym)
07510 {
07511     return sym;
07512 }
07513 
07514 static VALUE
07515 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
07516 {
07517     VALUE obj;
07518 
07519     if (argc < 1) {
07520         rb_raise(rb_eArgError, "no receiver given");
07521     }
07522     obj = argv[0];
07523     return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
07524 }
07525 
07526 /*
07527  * call-seq:
07528  *   sym.to_proc
07529  *
07530  * Returns a _Proc_ object which responds to the given method by _sym_.
07531  *
07532  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
07533  */
07534 
07535 static VALUE
07536 sym_to_proc(VALUE sym)
07537 {
07538     static VALUE sym_proc_cache = Qfalse;
07539     enum {SYM_PROC_CACHE_SIZE = 67};
07540     VALUE proc;
07541     long id, index;
07542     VALUE *aryp;
07543 
07544     if (!sym_proc_cache) {
07545         sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07546         rb_gc_register_mark_object(sym_proc_cache);
07547         rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07548     }
07549 
07550     id = SYM2ID(sym);
07551     index = (id % SYM_PROC_CACHE_SIZE) << 1;
07552 
07553     aryp = RARRAY_PTR(sym_proc_cache);
07554     if (aryp[index] == sym) {
07555         return aryp[index + 1];
07556     }
07557     else {
07558         proc = rb_proc_new(sym_call, (VALUE)id);
07559         aryp[index] = sym;
07560         aryp[index + 1] = proc;
07561         return proc;
07562     }
07563 }
07564 
07565 /*
07566  * call-seq:
07567  *
07568  *   sym.succ
07569  *
07570  * Same as <code>sym.to_s.succ.intern</code>.
07571  */
07572 
07573 static VALUE
07574 sym_succ(VALUE sym)
07575 {
07576     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07577 }
07578 
07579 /*
07580  * call-seq:
07581  *
07582  *   sym <=> other       -> -1, 0, +1 or nil
07583  *
07584  * Compares _sym_ with _other_ in string form.
07585  */
07586 
07587 static VALUE
07588 sym_cmp(VALUE sym, VALUE other)
07589 {
07590     if (!SYMBOL_P(other)) {
07591         return Qnil;
07592     }
07593     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07594 }
07595 
07596 /*
07597  * call-seq:
07598  *
07599  *   sym.casecmp(other)  -> -1, 0, +1 or nil
07600  *
07601  * Case-insensitive version of <code>Symbol#<=></code>.
07602  */
07603 
07604 static VALUE
07605 sym_casecmp(VALUE sym, VALUE other)
07606 {
07607     if (!SYMBOL_P(other)) {
07608         return Qnil;
07609     }
07610     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07611 }
07612 
07613 /*
07614  * call-seq:
07615  *   sym =~ obj   -> fixnum or nil
07616  *
07617  * Returns <code>sym.to_s =~ obj</code>.
07618  */
07619 
07620 static VALUE
07621 sym_match(VALUE sym, VALUE other)
07622 {
07623     return rb_str_match(rb_sym_to_s(sym), other);
07624 }
07625 
07626 /*
07627  * call-seq:
07628  *   sym[idx]      -> char
07629  *   sym[b, n]     -> string
07630  *
07631  * Returns <code>sym.to_s[]</code>.
07632  */
07633 
07634 static VALUE
07635 sym_aref(int argc, VALUE *argv, VALUE sym)
07636 {
07637     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07638 }
07639 
07640 /*
07641  * call-seq:
07642  *   sym.length    -> integer
07643  *
07644  * Same as <code>sym.to_s.length</code>.
07645  */
07646 
07647 static VALUE
07648 sym_length(VALUE sym)
07649 {
07650     return rb_str_length(rb_id2str(SYM2ID(sym)));
07651 }
07652 
07653 /*
07654  * call-seq:
07655  *   sym.empty?   -> true or false
07656  *
07657  * Returns whether or not _sym_ is :"".
07658  */
07659 
07660 static VALUE
07661 sym_empty(VALUE sym)
07662 {
07663     return rb_str_empty(rb_id2str(SYM2ID(sym)));
07664 }
07665 
07666 /*
07667  * call-seq:
07668  *   sym.upcase    -> symbol
07669  *
07670  * Same as <code>sym.to_s.upcase.intern</code>.
07671  */
07672 
07673 static VALUE
07674 sym_upcase(VALUE sym)
07675 {
07676     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07677 }
07678 
07679 /*
07680  * call-seq:
07681  *   sym.downcase  -> symbol
07682  *
07683  * Same as <code>sym.to_s.downcase.intern</code>.
07684  */
07685 
07686 static VALUE
07687 sym_downcase(VALUE sym)
07688 {
07689     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07690 }
07691 
07692 /*
07693  * call-seq:
07694  *   sym.capitalize  -> symbol
07695  *
07696  * Same as <code>sym.to_s.capitalize.intern</code>.
07697  */
07698 
07699 static VALUE
07700 sym_capitalize(VALUE sym)
07701 {
07702     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07703 }
07704 
07705 /*
07706  * call-seq:
07707  *   sym.swapcase  -> symbol
07708  *
07709  * Same as <code>sym.to_s.swapcase.intern</code>.
07710  */
07711 
07712 static VALUE
07713 sym_swapcase(VALUE sym)
07714 {
07715     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07716 }
07717 
07718 /*
07719  * call-seq:
07720  *   sym.encoding   -> encoding
07721  *
07722  * Returns the Encoding object that represents the encoding of _sym_.
07723  */
07724 
07725 static VALUE
07726 sym_encoding(VALUE sym)
07727 {
07728     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07729 }
07730 
07731 ID
07732 rb_to_id(VALUE name)
07733 {
07734     VALUE tmp;
07735 
07736     switch (TYPE(name)) {
07737       default:
07738         tmp = rb_check_string_type(name);
07739         if (NIL_P(tmp)) {
07740             tmp = rb_inspect(name);
07741             rb_raise(rb_eTypeError, "%s is not a symbol",
07742                      RSTRING_PTR(tmp));
07743         }
07744         name = tmp;
07745         /* fall through */
07746       case T_STRING:
07747         name = rb_str_intern(name);
07748         /* fall through */
07749       case T_SYMBOL:
07750         return SYM2ID(name);
07751     }
07752     return Qnil; /* not reached */
07753 }
07754 
07755 /*
07756  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
07757  *  bytes, typically representing characters. String objects may be created
07758  *  using <code>String::new</code> or as literals.
07759  *
07760  *  Because of aliasing issues, users of strings should be aware of the methods
07761  *  that modify the contents of a <code>String</code> object.  Typically,
07762  *  methods with names ending in ``!'' modify their receiver, while those
07763  *  without a ``!'' return a new <code>String</code>.  However, there are
07764  *  exceptions, such as <code>String#[]=</code>.
07765  *
07766  */
07767 
07768 void
07769 Init_String(void)
07770 {
07771 #undef rb_intern
07772 #define rb_intern(str) rb_intern_const(str)
07773 
07774     rb_cString  = rb_define_class("String", rb_cObject);
07775     rb_include_module(rb_cString, rb_mComparable);
07776     rb_define_alloc_func(rb_cString, str_alloc);
07777     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07778     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07779     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07780     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07781     rb_define_method(rb_cString, "==", rb_str_equal, 1);
07782     rb_define_method(rb_cString, "===", rb_str_equal, 1);
07783     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07784     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07785     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07786     rb_define_method(rb_cString, "+", rb_str_plus, 1);
07787     rb_define_method(rb_cString, "*", rb_str_times, 1);
07788     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07789     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07790     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07791     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07792     rb_define_method(rb_cString, "length", rb_str_length, 0);
07793     rb_define_method(rb_cString, "size", rb_str_length, 0);
07794     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07795     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07796     rb_define_method(rb_cString, "=~", rb_str_match, 1);
07797     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07798     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07799     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07800     rb_define_method(rb_cString, "next", rb_str_succ, 0);
07801     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07802     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07803     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07804     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07805     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07806     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07807     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07808     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07809     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07810     rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
07811 
07812     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07813     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07814     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07815     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07816     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07817     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07818 
07819     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07820     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07821     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07822     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07823 
07824     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07825     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07826     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07827     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07828 
07829     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07830     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07831     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07832     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07833     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07834     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07835     rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07836     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07837     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07838     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07839     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07840     rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
07841     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07842     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07843     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07844     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07845 
07846     rb_define_method(rb_cString, "include?", rb_str_include, 1);
07847     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07848     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07849 
07850     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07851 
07852     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07853     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07854     rb_define_method(rb_cString, "center", rb_str_center, -1);
07855 
07856     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07857     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07858     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07859     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07860     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07861     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07862     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07863 
07864     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07865     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07866     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07867     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07868     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07869     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07870     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07871 
07872     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07873     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07874     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07875     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07876     rb_define_method(rb_cString, "count", rb_str_count, -1);
07877 
07878     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07879     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07880     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07881     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07882 
07883     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07884     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07885     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07886     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07887 
07888     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07889 
07890     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07891     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07892 
07893     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07894     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07895 
07896     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
07897     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07898     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07899     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07900 
07901     id_to_s = rb_intern("to_s");
07902 
07903     rb_fs = Qnil;
07904     rb_define_variable("$;", &rb_fs);
07905     rb_define_variable("$-F", &rb_fs);
07906 
07907     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07908     rb_include_module(rb_cSymbol, rb_mComparable);
07909     rb_undef_alloc_func(rb_cSymbol);
07910     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07911     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
07912 
07913     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07914     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07915     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07916     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07917     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07918     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07919     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07920     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07921     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07922     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07923 
07924     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07925     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07926     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07927 
07928     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07929     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07930     rb_define_method(rb_cSymbol, "length", sym_length, 0);
07931     rb_define_method(rb_cSymbol, "size", sym_length, 0);
07932     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07933     rb_define_method(rb_cSymbol, "match", sym_match, 1);
07934 
07935     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07936     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07937     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07938     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07939 
07940     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07941 }
07942