|
Ruby
1.9.3p448(2013-06-27revision41675)
|
00001 /********************************************************************** 00002 00003 string.c - 00004 00005 $Author: usa $ 00006 created at: Mon Aug 9 17:12:58 JST 1993 00007 00008 Copyright (C) 1993-2007 Yukihiro Matsumoto 00009 Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 00010 Copyright (C) 2000 Information-technology Promotion Agency, Japan 00011 00012 **********************************************************************/ 00013 00014 #include "ruby/ruby.h" 00015 #include "ruby/re.h" 00016 #include "ruby/encoding.h" 00017 #include "internal.h" 00018 #include <assert.h> 00019 00020 #define BEG(no) (regs->beg[(no)]) 00021 #define END(no) (regs->end[(no)]) 00022 00023 #include <math.h> 00024 #include <ctype.h> 00025 00026 #ifdef HAVE_UNISTD_H 00027 #include <unistd.h> 00028 #endif 00029 00030 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) 00031 00032 #undef rb_str_new_cstr 00033 #undef rb_tainted_str_new_cstr 00034 #undef rb_usascii_str_new_cstr 00035 #undef rb_external_str_new_cstr 00036 #undef rb_locale_str_new_cstr 00037 #undef rb_str_new2 00038 #undef rb_str_new3 00039 #undef rb_str_new4 00040 #undef rb_str_new5 00041 #undef rb_tainted_str_new2 00042 #undef rb_usascii_str_new2 00043 #undef rb_str_dup_frozen 00044 #undef rb_str_buf_new_cstr 00045 #undef rb_str_buf_new2 00046 #undef rb_str_buf_cat2 00047 #undef rb_str_cat2 00048 00049 static VALUE rb_str_clear(VALUE str); 00050 00051 VALUE rb_cString; 00052 VALUE rb_cSymbol; 00053 00054 #define RUBY_MAX_CHAR_LEN 16 00055 #define STR_TMPLOCK FL_USER7 00056 #define STR_NOEMBED FL_USER1 00057 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */ 00058 #define STR_ASSOC FL_USER3 00059 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED) 00060 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC) 00061 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC) 00062 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC)) 00063 #define 
STR_UNSET_NOCAPA(s) do {\ 00064 if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\ 00065 } while (0) 00066 00067 00068 #define STR_SET_NOEMBED(str) do {\ 00069 FL_SET((str), STR_NOEMBED);\ 00070 STR_SET_EMBED_LEN((str), 0);\ 00071 } while (0) 00072 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED) 00073 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED)) 00074 #define STR_SET_EMBED_LEN(str, n) do { \ 00075 long tmp_n = (n);\ 00076 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\ 00077 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\ 00078 } while (0) 00079 00080 #define STR_SET_LEN(str, n) do { \ 00081 if (STR_EMBED_P(str)) {\ 00082 STR_SET_EMBED_LEN((str), (n));\ 00083 }\ 00084 else {\ 00085 RSTRING(str)->as.heap.len = (n);\ 00086 }\ 00087 } while (0) 00088 00089 #define STR_DEC_LEN(str) do {\ 00090 if (STR_EMBED_P(str)) {\ 00091 long n = RSTRING_LEN(str);\ 00092 n--;\ 00093 STR_SET_EMBED_LEN((str), n);\ 00094 }\ 00095 else {\ 00096 RSTRING(str)->as.heap.len--;\ 00097 }\ 00098 } while (0) 00099 00100 #define RESIZE_CAPA(str,capacity) do {\ 00101 if (STR_EMBED_P(str)) {\ 00102 if ((capacity) > RSTRING_EMBED_LEN_MAX) {\ 00103 char *tmp = ALLOC_N(char, (capacity)+1);\ 00104 memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\ 00105 RSTRING(str)->as.heap.ptr = tmp;\ 00106 RSTRING(str)->as.heap.len = RSTRING_LEN(str);\ 00107 STR_SET_NOEMBED(str);\ 00108 RSTRING(str)->as.heap.aux.capa = (capacity);\ 00109 }\ 00110 }\ 00111 else {\ 00112 REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\ 00113 if (!STR_NOCAPA_P(str))\ 00114 RSTRING(str)->as.heap.aux.capa = (capacity);\ 00115 }\ 00116 } while (0) 00117 00118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) 00120 00121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) 00122 00123 static inline int 00124 single_byte_optimizable(VALUE str) 00125 { 00126 
rb_encoding *enc; 00127 00128 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */ 00129 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) 00130 return 1; 00131 00132 enc = STR_ENC_GET(str); 00133 if (rb_enc_mbmaxlen(enc) == 1) 00134 return 1; 00135 00136 /* Conservative. Possibly single byte. 00137 * "\xa1" in Shift_JIS for example. */ 00138 return 0; 00139 } 00140 00141 VALUE rb_fs; 00142 00143 static inline const char * 00144 search_nonascii(const char *p, const char *e) 00145 { 00146 #if SIZEOF_VALUE == 8 00147 # define NONASCII_MASK 0x8080808080808080ULL 00148 #elif SIZEOF_VALUE == 4 00149 # define NONASCII_MASK 0x80808080UL 00150 #endif 00151 #ifdef NONASCII_MASK 00152 if ((int)sizeof(VALUE) * 2 < e - p) { 00153 const VALUE *s, *t; 00154 const VALUE lowbits = sizeof(VALUE) - 1; 00155 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 00156 while (p < (const char *)s) { 00157 if (!ISASCII(*p)) 00158 return p; 00159 p++; 00160 } 00161 t = (const VALUE*)(~lowbits & (VALUE)e); 00162 while (s < t) { 00163 if (*s & NONASCII_MASK) { 00164 t = s; 00165 break; 00166 } 00167 s++; 00168 } 00169 p = (const char *)t; 00170 } 00171 #endif 00172 while (p < e) { 00173 if (!ISASCII(*p)) 00174 return p; 00175 p++; 00176 } 00177 return NULL; 00178 } 00179 00180 static int 00181 coderange_scan(const char *p, long len, rb_encoding *enc) 00182 { 00183 const char *e = p + len; 00184 00185 if (rb_enc_to_index(enc) == 0) { 00186 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00187 p = search_nonascii(p, e); 00188 return p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; 00189 } 00190 00191 if (rb_enc_asciicompat(enc)) { 00192 p = search_nonascii(p, e); 00193 if (!p) { 00194 return ENC_CODERANGE_7BIT; 00195 } 00196 while (p < e) { 00197 int ret = rb_enc_precise_mbclen(p, e, enc); 00198 if (!MBCLEN_CHARFOUND_P(ret)) { 00199 return ENC_CODERANGE_BROKEN; 00200 } 00201 p += MBCLEN_CHARFOUND_LEN(ret); 00202 if (p < e) { 00203 p = search_nonascii(p, e); 00204 if (!p) { 00205 return ENC_CODERANGE_VALID; 00206 } 00207 } 00208 } 00209 if (e < p) { 00210 return ENC_CODERANGE_BROKEN; 00211 } 00212 return ENC_CODERANGE_VALID; 00213 } 00214 00215 while (p < e) { 00216 int ret = rb_enc_precise_mbclen(p, e, enc); 00217 00218 if (!MBCLEN_CHARFOUND_P(ret)) { 00219 return ENC_CODERANGE_BROKEN; 00220 } 00221 p += MBCLEN_CHARFOUND_LEN(ret); 00222 } 00223 if (e < p) { 00224 return ENC_CODERANGE_BROKEN; 00225 } 00226 return ENC_CODERANGE_VALID; 00227 } 00228 00229 long 00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr) 00231 { 00232 const char *p = s; 00233 00234 if (*cr == ENC_CODERANGE_BROKEN) 00235 return e - s; 00236 00237 if (rb_enc_to_index(enc) == 0) { 00238 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00239 p = search_nonascii(p, e); 00240 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 00241 return e - s; 00242 } 00243 else if (rb_enc_asciicompat(enc)) { 00244 p = search_nonascii(p, e); 00245 if (!p) { 00246 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT; 00247 return e - s; 00248 } 00249 while (p < e) { 00250 int ret = rb_enc_precise_mbclen(p, e, enc); 00251 if (!MBCLEN_CHARFOUND_P(ret)) { 00252 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00253 return p - s; 00254 } 00255 p += MBCLEN_CHARFOUND_LEN(ret); 00256 if (p < e) { 00257 p = search_nonascii(p, e); 00258 if (!p) { 00259 *cr = ENC_CODERANGE_VALID; 00260 return e - s; 00261 } 00262 } 00263 } 00264 *cr = e < p ? 
ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00265 return p - s; 00266 } 00267 else { 00268 while (p < e) { 00269 int ret = rb_enc_precise_mbclen(p, e, enc); 00270 if (!MBCLEN_CHARFOUND_P(ret)) { 00271 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00272 return p - s; 00273 } 00274 p += MBCLEN_CHARFOUND_LEN(ret); 00275 } 00276 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00277 return p - s; 00278 } 00279 } 00280 00281 static inline void 00282 str_enc_copy(VALUE str1, VALUE str2) 00283 { 00284 rb_enc_set_index(str1, ENCODING_GET(str2)); 00285 } 00286 00287 static void 00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src) 00289 { 00290 /* this function is designed for copying encoding and coderange 00291 * from src to new string "dest" which is made from the part of src. 00292 */ 00293 str_enc_copy(dest, src); 00294 switch (ENC_CODERANGE(src)) { 00295 case ENC_CODERANGE_7BIT: 00296 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00297 break; 00298 case ENC_CODERANGE_VALID: 00299 if (!rb_enc_asciicompat(STR_ENC_GET(src)) || 00300 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest))) 00301 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00302 else 00303 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00304 break; 00305 default: 00306 if (RSTRING_LEN(dest) == 0) { 00307 if (!rb_enc_asciicompat(STR_ENC_GET(src))) 00308 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00309 else 00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00311 } 00312 break; 00313 } 00314 } 00315 00316 static void 00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) 00318 { 00319 str_enc_copy(dest, src); 00320 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); 00321 } 00322 00323 int 00324 rb_enc_str_coderange(VALUE str) 00325 { 00326 int cr = ENC_CODERANGE(str); 00327 00328 if (cr == ENC_CODERANGE_UNKNOWN) { 00329 rb_encoding *enc = STR_ENC_GET(str); 00330 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); 00331 ENC_CODERANGE_SET(str, cr); 00332 } 00333 
return cr; 00334 } 00335 00336 int 00337 rb_enc_str_asciionly_p(VALUE str) 00338 { 00339 rb_encoding *enc = STR_ENC_GET(str); 00340 00341 if (!rb_enc_asciicompat(enc)) 00342 return FALSE; 00343 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00344 return TRUE; 00345 return FALSE; 00346 } 00347 00348 static inline void 00349 str_mod_check(VALUE s, const char *p, long len) 00350 { 00351 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){ 00352 rb_raise(rb_eRuntimeError, "string modified"); 00353 } 00354 } 00355 00356 size_t 00357 rb_str_capacity(VALUE str) 00358 { 00359 if (STR_EMBED_P(str)) { 00360 return RSTRING_EMBED_LEN_MAX; 00361 } 00362 else if (STR_NOCAPA_P(str)) { 00363 return RSTRING(str)->as.heap.len; 00364 } 00365 else { 00366 return RSTRING(str)->as.heap.aux.capa; 00367 } 00368 } 00369 00370 static inline VALUE 00371 str_alloc(VALUE klass) 00372 { 00373 NEWOBJ(str, struct RString); 00374 OBJSETUP(str, klass, T_STRING); 00375 00376 str->as.heap.ptr = 0; 00377 str->as.heap.len = 0; 00378 str->as.heap.aux.capa = 0; 00379 00380 return (VALUE)str; 00381 } 00382 00383 static VALUE 00384 str_new(VALUE klass, const char *ptr, long len) 00385 { 00386 VALUE str; 00387 00388 if (len < 0) { 00389 rb_raise(rb_eArgError, "negative string size (or size too big)"); 00390 } 00391 00392 str = str_alloc(klass); 00393 if (len > RSTRING_EMBED_LEN_MAX) { 00394 RSTRING(str)->as.heap.aux.capa = len; 00395 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1); 00396 STR_SET_NOEMBED(str); 00397 } 00398 else if (len == 0) { 00399 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 00400 } 00401 if (ptr) { 00402 memcpy(RSTRING_PTR(str), ptr, len); 00403 } 00404 STR_SET_LEN(str, len); 00405 RSTRING_PTR(str)[len] = '\0'; 00406 return str; 00407 } 00408 00409 VALUE 00410 rb_str_new(const char *ptr, long len) 00411 { 00412 return str_new(rb_cString, ptr, len); 00413 } 00414 00415 VALUE 00416 rb_usascii_str_new(const char *ptr, long len) 00417 { 00418 VALUE str = rb_str_new(ptr, len); 00419 
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00420 return str; 00421 } 00422 00423 VALUE 00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) 00425 { 00426 VALUE str = rb_str_new(ptr, len); 00427 rb_enc_associate(str, enc); 00428 return str; 00429 } 00430 00431 VALUE 00432 rb_str_new_cstr(const char *ptr) 00433 { 00434 if (!ptr) { 00435 rb_raise(rb_eArgError, "NULL pointer given"); 00436 } 00437 return rb_str_new(ptr, strlen(ptr)); 00438 } 00439 00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr)) 00441 #define rb_str_new2 rb_str_new_cstr 00442 00443 VALUE 00444 rb_usascii_str_new_cstr(const char *ptr) 00445 { 00446 VALUE str = rb_str_new2(ptr); 00447 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00448 return str; 00449 } 00450 00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr)) 00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr 00453 00454 VALUE 00455 rb_tainted_str_new(const char *ptr, long len) 00456 { 00457 VALUE str = rb_str_new(ptr, len); 00458 00459 OBJ_TAINT(str); 00460 return str; 00461 } 00462 00463 VALUE 00464 rb_tainted_str_new_cstr(const char *ptr) 00465 { 00466 VALUE str = rb_str_new2(ptr); 00467 00468 OBJ_TAINT(str); 00469 return str; 00470 } 00471 00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr)) 00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr 00474 00475 VALUE 00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) 00477 { 00478 rb_econv_t *ec; 00479 rb_econv_result_t ret; 00480 long len; 00481 VALUE newstr; 00482 const unsigned char *sp; 00483 unsigned char *dp; 00484 00485 if (!to) return str; 00486 if (from == to) return str; 00487 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) || 00488 to == rb_ascii8bit_encoding()) { 00489 if (STR_ENC_GET(str) != to) { 00490 str = 
rb_str_dup(str); 00491 rb_enc_associate(str, to); 00492 } 00493 return str; 00494 } 00495 00496 len = RSTRING_LEN(str); 00497 newstr = rb_str_new(0, len); 00498 00499 retry: 00500 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts); 00501 if (!ec) return str; 00502 00503 sp = (unsigned char*)RSTRING_PTR(str); 00504 dp = (unsigned char*)RSTRING_PTR(newstr); 00505 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str), 00506 &dp, (unsigned char*)RSTRING_END(newstr), 0); 00507 rb_econv_close(ec); 00508 switch (ret) { 00509 case econv_destination_buffer_full: 00510 /* destination buffer short */ 00511 len = len < 2 ? 2 : len * 2; 00512 rb_str_resize(newstr, len); 00513 goto retry; 00514 00515 case econv_finished: 00516 len = dp - (unsigned char*)RSTRING_PTR(newstr); 00517 rb_str_set_len(newstr, len); 00518 rb_enc_associate(newstr, to); 00519 return newstr; 00520 00521 default: 00522 /* some error, return original */ 00523 return str; 00524 } 00525 } 00526 00527 VALUE 00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to) 00529 { 00530 return rb_str_conv_enc_opts(str, from, to, 0, Qnil); 00531 } 00532 00533 VALUE 00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc) 00535 { 00536 VALUE str; 00537 00538 str = rb_tainted_str_new(ptr, len); 00539 if (eenc == rb_usascii_encoding() && 00540 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 00541 rb_enc_associate(str, rb_ascii8bit_encoding()); 00542 return str; 00543 } 00544 rb_enc_associate(str, eenc); 00545 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding()); 00546 } 00547 00548 VALUE 00549 rb_external_str_new(const char *ptr, long len) 00550 { 00551 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding()); 00552 } 00553 00554 VALUE 00555 rb_external_str_new_cstr(const char *ptr) 00556 { 00557 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding()); 00558 } 00559 00560 VALUE 00561 
rb_locale_str_new(const char *ptr, long len) 00562 { 00563 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding()); 00564 } 00565 00566 VALUE 00567 rb_locale_str_new_cstr(const char *ptr) 00568 { 00569 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding()); 00570 } 00571 00572 VALUE 00573 rb_filesystem_str_new(const char *ptr, long len) 00574 { 00575 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding()); 00576 } 00577 00578 VALUE 00579 rb_filesystem_str_new_cstr(const char *ptr) 00580 { 00581 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding()); 00582 } 00583 00584 VALUE 00585 rb_str_export(VALUE str) 00586 { 00587 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding()); 00588 } 00589 00590 VALUE 00591 rb_str_export_locale(VALUE str) 00592 { 00593 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding()); 00594 } 00595 00596 VALUE 00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc) 00598 { 00599 return rb_str_conv_enc(str, STR_ENC_GET(str), enc); 00600 } 00601 00602 static VALUE 00603 str_replace_shared(VALUE str2, VALUE str) 00604 { 00605 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) { 00606 STR_SET_EMBED(str2); 00607 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1); 00608 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str)); 00609 } 00610 else { 00611 str = rb_str_new_frozen(str); 00612 FL_SET(str2, STR_NOEMBED); 00613 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00614 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00615 RSTRING(str2)->as.heap.aux.shared = str; 00616 FL_SET(str2, ELTS_SHARED); 00617 } 00618 rb_enc_cr_str_exact_copy(str2, str); 00619 00620 return str2; 00621 } 00622 00623 static VALUE 00624 str_new_shared(VALUE klass, VALUE str) 00625 { 00626 return str_replace_shared(str_alloc(klass), str); 00627 } 00628 00629 static VALUE 00630 str_new3(VALUE klass, VALUE str) 00631 { 00632 return str_new_shared(klass, str); 00633 } 00634 
00635 VALUE 00636 rb_str_new_shared(VALUE str) 00637 { 00638 VALUE str2 = str_new3(rb_obj_class(str), str); 00639 00640 OBJ_INFECT(str2, str); 00641 return str2; 00642 } 00643 00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str)) 00645 #define rb_str_new3 rb_str_new_shared 00646 00647 static VALUE 00648 str_new4(VALUE klass, VALUE str) 00649 { 00650 VALUE str2; 00651 00652 str2 = str_alloc(klass); 00653 STR_SET_NOEMBED(str2); 00654 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00655 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00656 if (STR_SHARED_P(str)) { 00657 VALUE shared = RSTRING(str)->as.heap.aux.shared; 00658 assert(OBJ_FROZEN(shared)); 00659 FL_SET(str2, ELTS_SHARED); 00660 RSTRING(str2)->as.heap.aux.shared = shared; 00661 } 00662 else { 00663 FL_SET(str, ELTS_SHARED); 00664 RSTRING(str)->as.heap.aux.shared = str2; 00665 } 00666 rb_enc_cr_str_exact_copy(str2, str); 00667 OBJ_INFECT(str2, str); 00668 return str2; 00669 } 00670 00671 VALUE 00672 rb_str_new_frozen(VALUE orig) 00673 { 00674 VALUE klass, str; 00675 00676 if (OBJ_FROZEN(orig)) return orig; 00677 klass = rb_obj_class(orig); 00678 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) { 00679 long ofs; 00680 assert(OBJ_FROZEN(str)); 00681 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig); 00682 if ((ofs > 0) || (klass != RBASIC(str)->klass) || 00683 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) || 00684 ENCODING_GET(str) != ENCODING_GET(orig)) { 00685 str = str_new3(klass, str); 00686 RSTRING(str)->as.heap.ptr += ofs; 00687 RSTRING(str)->as.heap.len -= ofs; 00688 rb_enc_cr_str_exact_copy(str, orig); 00689 OBJ_INFECT(str, orig); 00690 } 00691 } 00692 else if (STR_EMBED_P(orig)) { 00693 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig)); 00694 rb_enc_cr_str_exact_copy(str, orig); 00695 OBJ_INFECT(str, orig); 00696 } 00697 else if (STR_ASSOC_P(orig)) { 00698 VALUE assoc = RSTRING(orig)->as.heap.aux.shared; 00699 FL_UNSET(orig, STR_ASSOC); 00700 str = str_new4(klass, 
orig); 00701 FL_SET(str, STR_ASSOC); 00702 RSTRING(str)->as.heap.aux.shared = assoc; 00703 } 00704 else { 00705 str = str_new4(klass, orig); 00706 } 00707 OBJ_FREEZE(str); 00708 return str; 00709 } 00710 00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig)) 00712 #define rb_str_new4 rb_str_new_frozen 00713 00714 VALUE 00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len) 00716 { 00717 return str_new(rb_obj_class(obj), ptr, len); 00718 } 00719 00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len), 00721 rb_str_new_with_class, (obj, ptr, len)) 00722 #define rb_str_new5 rb_str_new_with_class 00723 00724 static VALUE 00725 str_new_empty(VALUE str) 00726 { 00727 VALUE v = rb_str_new5(str, 0, 0); 00728 rb_enc_copy(v, str); 00729 OBJ_INFECT(v, str); 00730 return v; 00731 } 00732 00733 #define STR_BUF_MIN_SIZE 128 00734 00735 VALUE 00736 rb_str_buf_new(long capa) 00737 { 00738 VALUE str = str_alloc(rb_cString); 00739 00740 if (capa < STR_BUF_MIN_SIZE) { 00741 capa = STR_BUF_MIN_SIZE; 00742 } 00743 FL_SET(str, STR_NOEMBED); 00744 RSTRING(str)->as.heap.aux.capa = capa; 00745 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1); 00746 RSTRING(str)->as.heap.ptr[0] = '\0'; 00747 00748 return str; 00749 } 00750 00751 VALUE 00752 rb_str_buf_new_cstr(const char *ptr) 00753 { 00754 VALUE str; 00755 long len = strlen(ptr); 00756 00757 str = rb_str_buf_new(len); 00758 rb_str_buf_cat(str, ptr, len); 00759 00760 return str; 00761 } 00762 00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr)) 00764 #define rb_str_buf_new2 rb_str_buf_new_cstr 00765 00766 VALUE 00767 rb_str_tmp_new(long len) 00768 { 00769 return str_new(0, 0, len); 00770 } 00771 00772 void * 00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len) 00774 { 00775 VALUE s = rb_str_tmp_new(len); 00776 *store = s; 00777 return RSTRING_PTR(s); 00778 } 00779 00780 void 00781 rb_free_tmp_buffer(volatile VALUE *store) 00782 { 00783 VALUE s = 
*store; 00784 *store = 0; 00785 if (s) rb_str_clear(s); 00786 } 00787 00788 void 00789 rb_str_free(VALUE str) 00790 { 00791 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 00792 xfree(RSTRING(str)->as.heap.ptr); 00793 } 00794 } 00795 00796 RUBY_FUNC_EXPORTED size_t 00797 rb_str_memsize(VALUE str) 00798 { 00799 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 00800 return RSTRING(str)->as.heap.aux.capa; 00801 } 00802 else { 00803 return 0; 00804 } 00805 } 00806 00807 VALUE 00808 rb_str_to_str(VALUE str) 00809 { 00810 return rb_convert_type(str, T_STRING, "String", "to_str"); 00811 } 00812 00813 static inline void str_discard(VALUE str); 00814 00815 void 00816 rb_str_shared_replace(VALUE str, VALUE str2) 00817 { 00818 rb_encoding *enc; 00819 int cr; 00820 if (str == str2) return; 00821 enc = STR_ENC_GET(str2); 00822 cr = ENC_CODERANGE(str2); 00823 str_discard(str); 00824 OBJ_INFECT(str, str2); 00825 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { 00826 STR_SET_EMBED(str); 00827 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); 00828 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); 00829 rb_enc_associate(str, enc); 00830 ENC_CODERANGE_SET(str, cr); 00831 return; 00832 } 00833 STR_SET_NOEMBED(str); 00834 STR_UNSET_NOCAPA(str); 00835 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00836 RSTRING(str)->as.heap.len = RSTRING_LEN(str2); 00837 if (STR_NOCAPA_P(str2)) { 00838 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA); 00839 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared; 00840 } 00841 else { 00842 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa; 00843 } 00844 STR_SET_EMBED(str2); /* abandon str2 */ 00845 RSTRING_PTR(str2)[0] = 0; 00846 STR_SET_EMBED_LEN(str2, 0); 00847 rb_enc_associate(str, enc); 00848 ENC_CODERANGE_SET(str, cr); 00849 } 00850 00851 static ID id_to_s; 00852 00853 VALUE 00854 rb_obj_as_string(VALUE obj) 00855 { 00856 VALUE str; 00857 00858 if (TYPE(obj) == T_STRING) { 00859 return obj; 00860 } 00861 str = 
rb_funcall(obj, id_to_s, 0); 00862 if (TYPE(str) != T_STRING) 00863 return rb_any_to_s(obj); 00864 if (OBJ_TAINTED(obj)) OBJ_TAINT(str); 00865 return str; 00866 } 00867 00868 static VALUE 00869 str_replace(VALUE str, VALUE str2) 00870 { 00871 long len; 00872 00873 len = RSTRING_LEN(str2); 00874 if (STR_ASSOC_P(str2)) { 00875 str2 = rb_str_new4(str2); 00876 } 00877 if (STR_SHARED_P(str2)) { 00878 VALUE shared = RSTRING(str2)->as.heap.aux.shared; 00879 assert(OBJ_FROZEN(shared)); 00880 STR_SET_NOEMBED(str); 00881 RSTRING(str)->as.heap.len = len; 00882 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00883 FL_SET(str, ELTS_SHARED); 00884 FL_UNSET(str, STR_ASSOC); 00885 RSTRING(str)->as.heap.aux.shared = shared; 00886 } 00887 else { 00888 str_replace_shared(str, str2); 00889 } 00890 00891 OBJ_INFECT(str, str2); 00892 rb_enc_cr_str_exact_copy(str, str2); 00893 return str; 00894 } 00895 00896 static VALUE 00897 str_duplicate(VALUE klass, VALUE str) 00898 { 00899 VALUE dup = str_alloc(klass); 00900 str_replace(dup, str); 00901 return dup; 00902 } 00903 00904 VALUE 00905 rb_str_dup(VALUE str) 00906 { 00907 return str_duplicate(rb_obj_class(str), str); 00908 } 00909 00910 VALUE 00911 rb_str_resurrect(VALUE str) 00912 { 00913 return str_replace(str_alloc(rb_cString), str); 00914 } 00915 00916 /* 00917 * call-seq: 00918 * String.new(str="") -> new_str 00919 * 00920 * Returns a new string object containing a copy of <i>str</i>. 
00921 */ 00922 00923 static VALUE 00924 rb_str_init(int argc, VALUE *argv, VALUE str) 00925 { 00926 VALUE orig; 00927 00928 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1) 00929 rb_str_replace(str, orig); 00930 return str; 00931 } 00932 00933 static inline long 00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) 00935 { 00936 long c; 00937 const char *q; 00938 00939 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 00940 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 00941 } 00942 else if (rb_enc_asciicompat(enc)) { 00943 c = 0; 00944 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { 00945 while (p < e) { 00946 if (ISASCII(*p)) { 00947 q = search_nonascii(p, e); 00948 if (!q) 00949 return c + (e - p); 00950 c += q - p; 00951 p = q; 00952 } 00953 p += rb_enc_fast_mbclen(p, e, enc); 00954 c++; 00955 } 00956 } 00957 else { 00958 while (p < e) { 00959 if (ISASCII(*p)) { 00960 q = search_nonascii(p, e); 00961 if (!q) 00962 return c + (e - p); 00963 c += q - p; 00964 p = q; 00965 } 00966 p += rb_enc_mbclen(p, e, enc); 00967 c++; 00968 } 00969 } 00970 return c; 00971 } 00972 00973 for (c=0; p<e; c++) { 00974 p += rb_enc_mbclen(p, e, enc); 00975 } 00976 return c; 00977 } 00978 00979 long 00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) 00981 { 00982 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); 00983 } 00984 00985 long 00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) 00987 { 00988 long c; 00989 const char *q; 00990 int ret; 00991 00992 *cr = 0; 00993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 00994 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 00995 } 00996 else if (rb_enc_asciicompat(enc)) { 00997 c = 0; 00998 while (p < e) { 00999 if (ISASCII(*p)) { 01000 q = search_nonascii(p, e); 01001 if (!q) { 01002 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01003 return c + (e - p); 01004 } 01005 c += q - p; 01006 p = q; 01007 } 01008 ret = 
rb_enc_precise_mbclen(p, e, enc); 01009 if (MBCLEN_CHARFOUND_P(ret)) { 01010 *cr |= ENC_CODERANGE_VALID; 01011 p += MBCLEN_CHARFOUND_LEN(ret); 01012 } 01013 else { 01014 *cr = ENC_CODERANGE_BROKEN; 01015 p++; 01016 } 01017 c++; 01018 } 01019 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01020 return c; 01021 } 01022 01023 for (c=0; p<e; c++) { 01024 ret = rb_enc_precise_mbclen(p, e, enc); 01025 if (MBCLEN_CHARFOUND_P(ret)) { 01026 *cr |= ENC_CODERANGE_VALID; 01027 p += MBCLEN_CHARFOUND_LEN(ret); 01028 } 01029 else { 01030 *cr = ENC_CODERANGE_BROKEN; 01031 if (p + rb_enc_mbminlen(enc) <= e) 01032 p += rb_enc_mbminlen(enc); 01033 else 01034 p = e; 01035 } 01036 } 01037 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01038 return c; 01039 } 01040 01041 #ifdef NONASCII_MASK 01042 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) 01043 01044 /* 01045 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx 01046 * bit represention. (see http://en.wikipedia.org/wiki/UTF-8) 01047 * Therefore, following pseudo code can detect UTF-8 leading byte. 01048 * 01049 * if (!(byte & 0x80)) 01050 * byte |= 0x40; // turn on bit6 01051 * return ((byte>>6) & 1); // bit6 represent it's leading byte or not. 01052 * 01053 * This function calculate every bytes in the argument word `s' 01054 * using the above logic concurrently. and gather every bytes result. 01055 */ 01056 static inline VALUE 01057 count_utf8_lead_bytes_with_word(const VALUE *s) 01058 { 01059 VALUE d = *s; 01060 01061 /* Transform into bit0 represent UTF-8 leading or not. */ 01062 d |= ~(d>>1); 01063 d >>= 6; 01064 d &= NONASCII_MASK >> 7; 01065 01066 /* Gather every bytes. 
*/ 01067 d += (d>>8); 01068 d += (d>>16); 01069 #if SIZEOF_VALUE == 8 01070 d += (d>>32); 01071 #endif 01072 return (d&0xF); 01073 } 01074 #endif 01075 01076 static long 01077 str_strlen(VALUE str, rb_encoding *enc) 01078 { 01079 const char *p, *e; 01080 long n; 01081 int cr; 01082 01083 if (single_byte_optimizable(str)) return RSTRING_LEN(str); 01084 if (!enc) enc = STR_ENC_GET(str); 01085 p = RSTRING_PTR(str); 01086 e = RSTRING_END(str); 01087 cr = ENC_CODERANGE(str); 01088 #ifdef NONASCII_MASK 01089 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01090 enc == rb_utf8_encoding()) { 01091 01092 VALUE len = 0; 01093 if ((int)sizeof(VALUE) * 2 < e - p) { 01094 const VALUE *s, *t; 01095 const VALUE lowbits = sizeof(VALUE) - 1; 01096 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01097 t = (const VALUE*)(~lowbits & (VALUE)e); 01098 while (p < (const char *)s) { 01099 if (is_utf8_lead_byte(*p)) len++; 01100 p++; 01101 } 01102 while (s < t) { 01103 len += count_utf8_lead_bytes_with_word(s); 01104 s++; 01105 } 01106 p = (const char *)s; 01107 } 01108 while (p < e) { 01109 if (is_utf8_lead_byte(*p)) len++; 01110 p++; 01111 } 01112 return (long)len; 01113 } 01114 #endif 01115 n = rb_enc_strlen_cr(p, e, enc, &cr); 01116 if (cr) { 01117 ENC_CODERANGE_SET(str, cr); 01118 } 01119 return n; 01120 } 01121 01122 long 01123 rb_str_strlen(VALUE str) 01124 { 01125 return str_strlen(str, STR_ENC_GET(str)); 01126 } 01127 01128 /* 01129 * call-seq: 01130 * str.length -> integer 01131 * str.size -> integer 01132 * 01133 * Returns the character length of <i>str</i>. 01134 */ 01135 01136 VALUE 01137 rb_str_length(VALUE str) 01138 { 01139 long len; 01140 01141 len = str_strlen(str, STR_ENC_GET(str)); 01142 return LONG2NUM(len); 01143 } 01144 01145 /* 01146 * call-seq: 01147 * str.bytesize -> integer 01148 * 01149 * Returns the length of <i>str</i> in bytes. 
01150 */ 01151 01152 static VALUE 01153 rb_str_bytesize(VALUE str) 01154 { 01155 return LONG2NUM(RSTRING_LEN(str)); 01156 } 01157 01158 /* 01159 * call-seq: 01160 * str.empty? -> true or false 01161 * 01162 * Returns <code>true</code> if <i>str</i> has a length of zero. 01163 * 01164 * "hello".empty? #=> false 01165 * "".empty? #=> true 01166 */ 01167 01168 static VALUE 01169 rb_str_empty(VALUE str) 01170 { 01171 if (RSTRING_LEN(str) == 0) 01172 return Qtrue; 01173 return Qfalse; 01174 } 01175 01176 /* 01177 * call-seq: 01178 * str + other_str -> new_str 01179 * 01180 * Concatenation---Returns a new <code>String</code> containing 01181 * <i>other_str</i> concatenated to <i>str</i>. 01182 * 01183 * "Hello from " + self.to_s #=> "Hello from main" 01184 */ 01185 01186 VALUE 01187 rb_str_plus(VALUE str1, VALUE str2) 01188 { 01189 VALUE str3; 01190 rb_encoding *enc; 01191 01192 StringValue(str2); 01193 enc = rb_enc_check(str1, str2); 01194 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2)); 01195 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1)); 01196 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1), 01197 RSTRING_PTR(str2), RSTRING_LEN(str2)); 01198 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0'; 01199 01200 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2)) 01201 OBJ_TAINT(str3); 01202 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc), 01203 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2))); 01204 return str3; 01205 } 01206 01207 /* 01208 * call-seq: 01209 * str * integer -> new_str 01210 * 01211 * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of 01212 * the receiver. 01213 * 01214 * "Ho! " * 3 #=> "Ho! Ho! Ho! 
" 01215 */ 01216 01217 VALUE 01218 rb_str_times(VALUE str, VALUE times) 01219 { 01220 VALUE str2; 01221 long n, len; 01222 char *ptr2; 01223 01224 len = NUM2LONG(times); 01225 if (len < 0) { 01226 rb_raise(rb_eArgError, "negative argument"); 01227 } 01228 if (len && LONG_MAX/len < RSTRING_LEN(str)) { 01229 rb_raise(rb_eArgError, "argument too big"); 01230 } 01231 01232 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str)); 01233 ptr2 = RSTRING_PTR(str2); 01234 if (len) { 01235 n = RSTRING_LEN(str); 01236 memcpy(ptr2, RSTRING_PTR(str), n); 01237 while (n <= len/2) { 01238 memcpy(ptr2 + n, ptr2, n); 01239 n *= 2; 01240 } 01241 memcpy(ptr2 + n, ptr2, len-n); 01242 } 01243 ptr2[RSTRING_LEN(str2)] = '\0'; 01244 OBJ_INFECT(str2, str); 01245 rb_enc_cr_str_copy_for_substr(str2, str); 01246 01247 return str2; 01248 } 01249 01250 /* 01251 * call-seq: 01252 * str % arg -> new_str 01253 * 01254 * Format---Uses <i>str</i> as a format specification, and returns the result 01255 * of applying it to <i>arg</i>. If the format specification contains more than 01256 * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code> 01257 * containing the values to be substituted. See <code>Kernel::sprintf</code> for 01258 * details of the format string. 
01259 * 01260 * "%05d" % 123 #=> "00123" 01261 * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6" 01262 * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar" 01263 */ 01264 01265 static VALUE 01266 rb_str_format_m(VALUE str, VALUE arg) 01267 { 01268 volatile VALUE tmp = rb_check_array_type(arg); 01269 01270 if (!NIL_P(tmp)) { 01271 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str); 01272 } 01273 return rb_str_format(1, &arg, str); 01274 } 01275 01276 static inline void 01277 str_modifiable(VALUE str) 01278 { 01279 if (FL_TEST(str, STR_TMPLOCK)) { 01280 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked"); 01281 } 01282 rb_check_frozen(str); 01283 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4) 01284 rb_raise(rb_eSecurityError, "Insecure: can't modify string"); 01285 } 01286 01287 static inline int 01288 str_independent(VALUE str) 01289 { 01290 str_modifiable(str); 01291 if (!STR_SHARED_P(str)) return 1; 01292 if (STR_EMBED_P(str)) return 1; 01293 return 0; 01294 } 01295 01296 static void 01297 str_make_independent_expand(VALUE str, long expand) 01298 { 01299 char *ptr; 01300 long len = RSTRING_LEN(str); 01301 long capa = len + expand; 01302 01303 if (len > capa) len = capa; 01304 ptr = ALLOC_N(char, capa + 1); 01305 if (RSTRING_PTR(str)) { 01306 memcpy(ptr, RSTRING_PTR(str), len); 01307 } 01308 STR_SET_NOEMBED(str); 01309 STR_UNSET_NOCAPA(str); 01310 ptr[len] = 0; 01311 RSTRING(str)->as.heap.ptr = ptr; 01312 RSTRING(str)->as.heap.len = len; 01313 RSTRING(str)->as.heap.aux.capa = capa; 01314 } 01315 01316 #define str_make_independent(str) str_make_independent_expand((str), 0L) 01317 01318 void 01319 rb_str_modify(VALUE str) 01320 { 01321 if (!str_independent(str)) 01322 str_make_independent(str); 01323 ENC_CODERANGE_CLEAR(str); 01324 } 01325 01326 void 01327 rb_str_modify_expand(VALUE str, long expand) 01328 { 01329 if (expand < 0) { 01330 rb_raise(rb_eArgError, "negative expanding string size"); 01331 } 01332 if 
(!str_independent(str)) { 01333 str_make_independent_expand(str, expand); 01334 } 01335 else if (expand > 0) { 01336 long len = RSTRING_LEN(str); 01337 long capa = len + expand; 01338 if (!STR_EMBED_P(str)) { 01339 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1); 01340 RSTRING(str)->as.heap.aux.capa = capa; 01341 } 01342 else if (capa > RSTRING_EMBED_LEN_MAX) { 01343 str_make_independent_expand(str, expand); 01344 } 01345 } 01346 ENC_CODERANGE_CLEAR(str); 01347 } 01348 01349 /* As rb_str_modify(), but don't clear coderange */ 01350 static void 01351 str_modify_keep_cr(VALUE str) 01352 { 01353 if (!str_independent(str)) 01354 str_make_independent(str); 01355 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN) 01356 /* Force re-scan later */ 01357 ENC_CODERANGE_CLEAR(str); 01358 } 01359 01360 static inline void 01361 str_discard(VALUE str) 01362 { 01363 str_modifiable(str); 01364 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { 01365 xfree(RSTRING_PTR(str)); 01366 RSTRING(str)->as.heap.ptr = 0; 01367 RSTRING(str)->as.heap.len = 0; 01368 } 01369 } 01370 01371 void 01372 rb_str_associate(VALUE str, VALUE add) 01373 { 01374 /* sanity check */ 01375 rb_check_frozen(str); 01376 if (STR_ASSOC_P(str)) { 01377 /* already associated */ 01378 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add); 01379 } 01380 else { 01381 if (STR_SHARED_P(str)) { 01382 VALUE assoc = RSTRING(str)->as.heap.aux.shared; 01383 str_make_independent(str); 01384 if (STR_ASSOC_P(assoc)) { 01385 assoc = RSTRING(assoc)->as.heap.aux.shared; 01386 rb_ary_concat(assoc, add); 01387 add = assoc; 01388 } 01389 } 01390 else if (STR_EMBED_P(str)) { 01391 str_make_independent(str); 01392 } 01393 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) { 01394 RESIZE_CAPA(str, RSTRING_LEN(str)); 01395 } 01396 FL_SET(str, STR_ASSOC); 01397 RBASIC(add)->klass = 0; 01398 RSTRING(str)->as.heap.aux.shared = add; 01399 } 01400 } 01401 01402 VALUE 01403 rb_str_associated(VALUE str) 01404 { 01405 if (STR_SHARED_P(str)) 
str = RSTRING(str)->as.heap.aux.shared; 01406 if (STR_ASSOC_P(str)) { 01407 return RSTRING(str)->as.heap.aux.shared; 01408 } 01409 return Qfalse; 01410 } 01411 01412 VALUE 01413 rb_string_value(volatile VALUE *ptr) 01414 { 01415 VALUE s = *ptr; 01416 if (TYPE(s) != T_STRING) { 01417 s = rb_str_to_str(s); 01418 *ptr = s; 01419 } 01420 return s; 01421 } 01422 01423 char * 01424 rb_string_value_ptr(volatile VALUE *ptr) 01425 { 01426 VALUE str = rb_string_value(ptr); 01427 return RSTRING_PTR(str); 01428 } 01429 01430 char * 01431 rb_string_value_cstr(volatile VALUE *ptr) 01432 { 01433 VALUE str = rb_string_value(ptr); 01434 char *s = RSTRING_PTR(str); 01435 long len = RSTRING_LEN(str); 01436 01437 if (!s || memchr(s, 0, len)) { 01438 rb_raise(rb_eArgError, "string contains null byte"); 01439 } 01440 if (s[len]) { 01441 rb_str_modify(str); 01442 s = RSTRING_PTR(str); 01443 s[RSTRING_LEN(str)] = 0; 01444 } 01445 return s; 01446 } 01447 01448 VALUE 01449 rb_check_string_type(VALUE str) 01450 { 01451 str = rb_check_convert_type(str, T_STRING, "String", "to_str"); 01452 return str; 01453 } 01454 01455 /* 01456 * call-seq: 01457 * String.try_convert(obj) -> string or nil 01458 * 01459 * Try to convert <i>obj</i> into a String, using to_str method. 01460 * Returns converted string or nil if <i>obj</i> cannot be converted 01461 * for any reason. 
01462 * 01463 * String.try_convert("str") #=> "str" 01464 * String.try_convert(/re/) #=> nil 01465 */ 01466 static VALUE 01467 rb_str_s_try_convert(VALUE dummy, VALUE str) 01468 { 01469 return rb_check_string_type(str); 01470 } 01471 01472 static char* 01473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc) 01474 { 01475 long nth = *nthp; 01476 if (rb_enc_mbmaxlen(enc) == 1) { 01477 p += nth; 01478 } 01479 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01480 p += nth * rb_enc_mbmaxlen(enc); 01481 } 01482 else if (rb_enc_asciicompat(enc)) { 01483 const char *p2, *e2; 01484 int n; 01485 01486 while (p < e && 0 < nth) { 01487 e2 = p + nth; 01488 if (e < e2) { 01489 *nthp = nth; 01490 return (char *)e; 01491 } 01492 if (ISASCII(*p)) { 01493 p2 = search_nonascii(p, e2); 01494 if (!p2) { 01495 *nthp = nth; 01496 return (char *)e2; 01497 } 01498 nth -= p2 - p; 01499 p = p2; 01500 } 01501 n = rb_enc_mbclen(p, e, enc); 01502 p += n; 01503 nth--; 01504 } 01505 *nthp = nth; 01506 if (nth != 0) { 01507 return (char *)e; 01508 } 01509 return (char *)p; 01510 } 01511 else { 01512 while (p < e && nth--) { 01513 p += rb_enc_mbclen(p, e, enc); 01514 } 01515 } 01516 if (p > e) p = e; 01517 *nthp = nth; 01518 return (char*)p; 01519 } 01520 01521 char* 01522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc) 01523 { 01524 return str_nth_len(p, e, &nth, enc); 01525 } 01526 01527 static char* 01528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01529 { 01530 if (singlebyte) 01531 p += nth; 01532 else { 01533 p = str_nth_len(p, e, &nth, enc); 01534 } 01535 if (!p) return 0; 01536 if (p > e) p = e; 01537 return (char *)p; 01538 } 01539 01540 /* char offset to byte offset */ 01541 static long 01542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01543 { 01544 const char *pp = str_nth(p, e, nth, enc, singlebyte); 01545 if (!pp) return e - p; 01546 return pp - p; 01547 } 
01548 01549 long 01550 rb_str_offset(VALUE str, long pos) 01551 { 01552 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 01553 STR_ENC_GET(str), single_byte_optimizable(str)); 01554 } 01555 01556 #ifdef NONASCII_MASK 01557 static char * 01558 str_utf8_nth(const char *p, const char *e, long *nthp) 01559 { 01560 long nth = *nthp; 01561 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) { 01562 const VALUE *s, *t; 01563 const VALUE lowbits = sizeof(VALUE) - 1; 01564 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01565 t = (const VALUE*)(~lowbits & (VALUE)e); 01566 while (p < (const char *)s) { 01567 if (is_utf8_lead_byte(*p)) nth--; 01568 p++; 01569 } 01570 do { 01571 nth -= count_utf8_lead_bytes_with_word(s); 01572 s++; 01573 } while (s < t && (int)sizeof(VALUE) <= nth); 01574 p = (char *)s; 01575 } 01576 while (p < e) { 01577 if (is_utf8_lead_byte(*p)) { 01578 if (nth == 0) break; 01579 nth--; 01580 } 01581 p++; 01582 } 01583 *nthp = nth; 01584 return (char *)p; 01585 } 01586 01587 static long 01588 str_utf8_offset(const char *p, const char *e, long nth) 01589 { 01590 const char *pp = str_utf8_nth(p, e, &nth); 01591 return pp - p; 01592 } 01593 #endif 01594 01595 /* byte offset to char offset */ 01596 long 01597 rb_str_sublen(VALUE str, long pos) 01598 { 01599 if (single_byte_optimizable(str) || pos < 0) 01600 return pos; 01601 else { 01602 char *p = RSTRING_PTR(str); 01603 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str)); 01604 } 01605 } 01606 01607 VALUE 01608 rb_str_subseq(VALUE str, long beg, long len) 01609 { 01610 VALUE str2; 01611 01612 if (RSTRING_LEN(str) == beg + len && 01613 RSTRING_EMBED_LEN_MAX < len) { 01614 str2 = rb_str_new_shared(rb_str_new_frozen(str)); 01615 rb_str_drop_bytes(str2, beg); 01616 } 01617 else { 01618 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len); 01619 } 01620 01621 rb_enc_cr_str_copy_for_substr(str2, str); 01622 OBJ_INFECT(str2, str); 01623 01624 return str2; 01625 } 01626 01627 
VALUE 01628 rb_str_substr(VALUE str, long beg, long len) 01629 { 01630 rb_encoding *enc = STR_ENC_GET(str); 01631 VALUE str2; 01632 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str); 01633 01634 if (len < 0) return Qnil; 01635 if (!RSTRING_LEN(str)) { 01636 len = 0; 01637 } 01638 if (single_byte_optimizable(str)) { 01639 if (beg > RSTRING_LEN(str)) return Qnil; 01640 if (beg < 0) { 01641 beg += RSTRING_LEN(str); 01642 if (beg < 0) return Qnil; 01643 } 01644 if (beg + len > RSTRING_LEN(str)) 01645 len = RSTRING_LEN(str) - beg; 01646 if (len <= 0) { 01647 len = 0; 01648 p = 0; 01649 } 01650 else 01651 p = s + beg; 01652 goto sub; 01653 } 01654 if (beg < 0) { 01655 if (len > -beg) len = -beg; 01656 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) { 01657 beg = -beg; 01658 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0); 01659 p = e; 01660 if (!p) return Qnil; 01661 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0); 01662 if (!p) return Qnil; 01663 len = e - p; 01664 goto sub; 01665 } 01666 else { 01667 beg += str_strlen(str, enc); 01668 if (beg < 0) return Qnil; 01669 } 01670 } 01671 else if (beg > 0 && beg > RSTRING_LEN(str)) { 01672 return Qnil; 01673 } 01674 if (len == 0) { 01675 if (beg > str_strlen(str, enc)) return Qnil; 01676 p = 0; 01677 } 01678 #ifdef NONASCII_MASK 01679 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01680 enc == rb_utf8_encoding()) { 01681 p = str_utf8_nth(s, e, &beg); 01682 if (beg > 0) return Qnil; 01683 len = str_utf8_offset(p, e, len); 01684 } 01685 #endif 01686 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01687 int char_sz = rb_enc_mbmaxlen(enc); 01688 01689 p = s + beg * char_sz; 01690 if (p > e) { 01691 return Qnil; 01692 } 01693 else if (len * char_sz > e - p) 01694 len = e - p; 01695 else 01696 len *= char_sz; 01697 } 01698 else if ((p = str_nth_len(s, e, &beg, enc)) == e) { 01699 if (beg > 0) return Qnil; 01700 len = 0; 01701 } 01702 else { 01703 len = str_offset(p, e, 
len, enc, 0); 01704 } 01705 sub: 01706 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) { 01707 str2 = rb_str_new4(str); 01708 str2 = str_new3(rb_obj_class(str2), str2); 01709 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 01710 RSTRING(str2)->as.heap.len = len; 01711 } 01712 else { 01713 str2 = rb_str_new5(str, p, len); 01714 rb_enc_cr_str_copy_for_substr(str2, str); 01715 OBJ_INFECT(str2, str); 01716 } 01717 01718 return str2; 01719 } 01720 01721 VALUE 01722 rb_str_freeze(VALUE str) 01723 { 01724 if (STR_ASSOC_P(str)) { 01725 VALUE ary = RSTRING(str)->as.heap.aux.shared; 01726 OBJ_FREEZE(ary); 01727 } 01728 return rb_obj_freeze(str); 01729 } 01730 01731 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str)) 01732 #define rb_str_dup_frozen rb_str_new_frozen 01733 01734 VALUE 01735 rb_str_locktmp(VALUE str) 01736 { 01737 if (FL_TEST(str, STR_TMPLOCK)) { 01738 rb_raise(rb_eRuntimeError, "temporal locking already locked string"); 01739 } 01740 FL_SET(str, STR_TMPLOCK); 01741 return str; 01742 } 01743 01744 VALUE 01745 rb_str_unlocktmp(VALUE str) 01746 { 01747 if (!FL_TEST(str, STR_TMPLOCK)) { 01748 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string"); 01749 } 01750 FL_UNSET(str, STR_TMPLOCK); 01751 return str; 01752 } 01753 01754 void 01755 rb_str_set_len(VALUE str, long len) 01756 { 01757 long capa; 01758 01759 str_modifiable(str); 01760 if (STR_SHARED_P(str)) { 01761 rb_raise(rb_eRuntimeError, "can't set length of shared string"); 01762 } 01763 if (len > (capa = (long)rb_str_capacity(str))) { 01764 rb_bug("probable buffer overflow: %ld for %ld", len, capa); 01765 } 01766 STR_SET_LEN(str, len); 01767 RSTRING_PTR(str)[len] = '\0'; 01768 } 01769 01770 VALUE 01771 rb_str_resize(VALUE str, long len) 01772 { 01773 long slen; 01774 int independent; 01775 01776 if (len < 0) { 01777 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01778 } 01779 01780 independent = 
str_independent(str); 01781 ENC_CODERANGE_CLEAR(str); 01782 slen = RSTRING_LEN(str); 01783 if (len != slen) { 01784 if (STR_EMBED_P(str)) { 01785 if (len <= RSTRING_EMBED_LEN_MAX) { 01786 STR_SET_EMBED_LEN(str, len); 01787 RSTRING(str)->as.ary[len] = '\0'; 01788 return str; 01789 } 01790 str_make_independent_expand(str, len - slen); 01791 STR_SET_NOEMBED(str); 01792 } 01793 else if (len <= RSTRING_EMBED_LEN_MAX) { 01794 char *ptr = RSTRING(str)->as.heap.ptr; 01795 STR_SET_EMBED(str); 01796 if (slen > len) slen = len; 01797 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen); 01798 RSTRING(str)->as.ary[len] = '\0'; 01799 STR_SET_EMBED_LEN(str, len); 01800 if (independent) xfree(ptr); 01801 return str; 01802 } 01803 else if (!independent) { 01804 str_make_independent_expand(str, len - slen); 01805 } 01806 else if (slen < len || slen - len > 1024) { 01807 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1); 01808 } 01809 if (!STR_NOCAPA_P(str)) { 01810 RSTRING(str)->as.heap.aux.capa = len; 01811 } 01812 RSTRING(str)->as.heap.len = len; 01813 RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */ 01814 } 01815 return str; 01816 } 01817 01818 static VALUE 01819 str_buf_cat(VALUE str, const char *ptr, long len) 01820 { 01821 long capa, total, off = -1; 01822 01823 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) { 01824 off = ptr - RSTRING_PTR(str); 01825 } 01826 rb_str_modify(str); 01827 if (len == 0) return 0; 01828 if (STR_ASSOC_P(str)) { 01829 FL_UNSET(str, STR_ASSOC); 01830 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); 01831 } 01832 else if (STR_EMBED_P(str)) { 01833 capa = RSTRING_EMBED_LEN_MAX; 01834 } 01835 else { 01836 capa = RSTRING(str)->as.heap.aux.capa; 01837 } 01838 if (RSTRING_LEN(str) >= LONG_MAX - len) { 01839 rb_raise(rb_eArgError, "string sizes too big"); 01840 } 01841 total = RSTRING_LEN(str)+len; 01842 if (capa <= total) { 01843 while (total > capa) { 01844 if (capa + 1 >= LONG_MAX / 2) { 01845 capa = (total + 4095) / 4096; 
01846 break; 01847 } 01848 capa = (capa + 1) * 2; 01849 } 01850 RESIZE_CAPA(str, capa); 01851 } 01852 if (off != -1) { 01853 ptr = RSTRING_PTR(str) + off; 01854 } 01855 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); 01856 STR_SET_LEN(str, total); 01857 RSTRING_PTR(str)[total] = '\0'; /* sentinel */ 01858 01859 return str; 01860 } 01861 01862 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr)) 01863 01864 VALUE 01865 rb_str_buf_cat(VALUE str, const char *ptr, long len) 01866 { 01867 if (len == 0) return str; 01868 if (len < 0) { 01869 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01870 } 01871 return str_buf_cat(str, ptr, len); 01872 } 01873 01874 VALUE 01875 rb_str_buf_cat2(VALUE str, const char *ptr) 01876 { 01877 return rb_str_buf_cat(str, ptr, strlen(ptr)); 01878 } 01879 01880 VALUE 01881 rb_str_cat(VALUE str, const char *ptr, long len) 01882 { 01883 if (len < 0) { 01884 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01885 } 01886 if (STR_ASSOC_P(str)) { 01887 char *p; 01888 rb_str_modify_expand(str, len); 01889 p = RSTRING(str)->as.heap.ptr; 01890 memcpy(p + RSTRING(str)->as.heap.len, ptr, len); 01891 len = RSTRING(str)->as.heap.len += len; 01892 p[len] = '\0'; /* sentinel */ 01893 return str; 01894 } 01895 01896 return rb_str_buf_cat(str, ptr, len); 01897 } 01898 01899 VALUE 01900 rb_str_cat2(VALUE str, const char *ptr) 01901 { 01902 return rb_str_cat(str, ptr, strlen(ptr)); 01903 } 01904 01905 static VALUE 01906 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, 01907 int ptr_encindex, int ptr_cr, int *ptr_cr_ret) 01908 { 01909 int str_encindex = ENCODING_GET(str); 01910 int res_encindex; 01911 int str_cr, res_cr; 01912 01913 str_cr = ENC_CODERANGE(str); 01914 01915 if (str_encindex == ptr_encindex) { 01916 if (str_cr == ENC_CODERANGE_UNKNOWN) 01917 ptr_cr = ENC_CODERANGE_UNKNOWN; 01918 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 01919 ptr_cr = coderange_scan(ptr, len, 
rb_enc_from_index(ptr_encindex)); 01920 } 01921 } 01922 else { 01923 rb_encoding *str_enc = rb_enc_from_index(str_encindex); 01924 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex); 01925 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) { 01926 if (len == 0) 01927 return str; 01928 if (RSTRING_LEN(str) == 0) { 01929 rb_str_buf_cat(str, ptr, len); 01930 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr); 01931 return str; 01932 } 01933 goto incompatible; 01934 } 01935 if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 01936 ptr_cr = coderange_scan(ptr, len, ptr_enc); 01937 } 01938 if (str_cr == ENC_CODERANGE_UNKNOWN) { 01939 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) { 01940 str_cr = rb_enc_str_coderange(str); 01941 } 01942 } 01943 } 01944 if (ptr_cr_ret) 01945 *ptr_cr_ret = ptr_cr; 01946 01947 if (str_encindex != ptr_encindex && 01948 str_cr != ENC_CODERANGE_7BIT && 01949 ptr_cr != ENC_CODERANGE_7BIT) { 01950 incompatible: 01951 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 01952 rb_enc_name(rb_enc_from_index(str_encindex)), 01953 rb_enc_name(rb_enc_from_index(ptr_encindex))); 01954 } 01955 01956 if (str_cr == ENC_CODERANGE_UNKNOWN) { 01957 res_encindex = str_encindex; 01958 res_cr = ENC_CODERANGE_UNKNOWN; 01959 } 01960 else if (str_cr == ENC_CODERANGE_7BIT) { 01961 if (ptr_cr == ENC_CODERANGE_7BIT) { 01962 res_encindex = str_encindex; 01963 res_cr = ENC_CODERANGE_7BIT; 01964 } 01965 else { 01966 res_encindex = ptr_encindex; 01967 res_cr = ptr_cr; 01968 } 01969 } 01970 else if (str_cr == ENC_CODERANGE_VALID) { 01971 res_encindex = str_encindex; 01972 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID) 01973 res_cr = str_cr; 01974 else 01975 res_cr = ptr_cr; 01976 } 01977 else { /* str_cr == ENC_CODERANGE_BROKEN */ 01978 res_encindex = str_encindex; 01979 res_cr = str_cr; 01980 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; 01981 } 01982 01983 if (len < 0) { 01984 rb_raise(rb_eArgError, 
"negative string size (or size too big)"); 01985 } 01986 str_buf_cat(str, ptr, len); 01987 ENCODING_CODERANGE_SET(str, res_encindex, res_cr); 01988 return str; 01989 } 01990 01991 VALUE 01992 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc) 01993 { 01994 return rb_enc_cr_str_buf_cat(str, ptr, len, 01995 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL); 01996 } 01997 01998 VALUE 01999 rb_str_buf_cat_ascii(VALUE str, const char *ptr) 02000 { 02001 /* ptr must reference NUL terminated ASCII string. */ 02002 int encindex = ENCODING_GET(str); 02003 rb_encoding *enc = rb_enc_from_index(encindex); 02004 if (rb_enc_asciicompat(enc)) { 02005 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr), 02006 encindex, ENC_CODERANGE_7BIT, 0); 02007 } 02008 else { 02009 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc)); 02010 while (*ptr) { 02011 unsigned int c = (unsigned char)*ptr; 02012 int len = rb_enc_codelen(c, enc); 02013 rb_enc_mbcput(c, buf, enc); 02014 rb_enc_cr_str_buf_cat(str, buf, len, 02015 encindex, ENC_CODERANGE_VALID, 0); 02016 ptr++; 02017 } 02018 return str; 02019 } 02020 } 02021 02022 VALUE 02023 rb_str_buf_append(VALUE str, VALUE str2) 02024 { 02025 int str2_cr; 02026 02027 str2_cr = ENC_CODERANGE(str2); 02028 02029 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2), 02030 ENCODING_GET(str2), str2_cr, &str2_cr); 02031 02032 OBJ_INFECT(str, str2); 02033 ENC_CODERANGE_SET(str2, str2_cr); 02034 02035 return str; 02036 } 02037 02038 VALUE 02039 rb_str_append(VALUE str, VALUE str2) 02040 { 02041 rb_encoding *enc; 02042 int cr, cr2; 02043 long len2; 02044 02045 StringValue(str2); 02046 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) { 02047 long len = RSTRING_LEN(str) + len2; 02048 enc = rb_enc_check(str, str2); 02049 cr = ENC_CODERANGE(str); 02050 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2; 02051 rb_str_modify_expand(str, len2); 02052 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, 02053 
RSTRING_PTR(str2), len2+1); 02054 RSTRING(str)->as.heap.len = len; 02055 rb_enc_associate(str, enc); 02056 ENC_CODERANGE_SET(str, cr); 02057 OBJ_INFECT(str, str2); 02058 return str; 02059 } 02060 return rb_str_buf_append(str, str2); 02061 } 02062 02063 /* 02064 * call-seq: 02065 * str << integer -> str 02066 * str.concat(integer) -> str 02067 * str << obj -> str 02068 * str.concat(obj) -> str 02069 * 02070 * Append---Concatenates the given object to <i>str</i>. If the object is a 02071 * <code>Integer</code>, it is considered as a codepoint, and is converted 02072 * to a character before concatenation. 02073 * 02074 * a = "hello " 02075 * a << "world" #=> "hello world" 02076 * a.concat(33) #=> "hello world!" 02077 */ 02078 02079 VALUE 02080 rb_str_concat(VALUE str1, VALUE str2) 02081 { 02082 unsigned int code; 02083 rb_encoding *enc = STR_ENC_GET(str1); 02084 02085 if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) { 02086 if (rb_num_to_uint(str2, &code) == 0) { 02087 } 02088 else if (FIXNUM_P(str2)) { 02089 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2)); 02090 } 02091 else { 02092 rb_raise(rb_eRangeError, "bignum out of char range"); 02093 } 02094 } 02095 else { 02096 return rb_str_append(str1, str2); 02097 } 02098 02099 if (enc == rb_usascii_encoding()) { 02100 /* US-ASCII automatically extended to ASCII-8BIT */ 02101 char buf[1]; 02102 buf[0] = (char)code; 02103 if (code > 0xFF) { 02104 rb_raise(rb_eRangeError, "%u out of char range", code); 02105 } 02106 rb_str_cat(str1, buf, 1); 02107 if (code > 127) { 02108 rb_enc_associate(str1, rb_ascii8bit_encoding()); 02109 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID); 02110 } 02111 } 02112 else { 02113 long pos = RSTRING_LEN(str1); 02114 int cr = ENC_CODERANGE(str1); 02115 int len; 02116 char *buf; 02117 02118 switch (len = rb_enc_codelen(code, enc)) { 02119 case ONIGERR_INVALID_CODE_POINT_VALUE: 02120 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02121 break; 02122 case 
ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: 02123 case 0: 02124 rb_raise(rb_eRangeError, "%u out of char range", code); 02125 break; 02126 } 02127 buf = ALLOCA_N(char, len + 1); 02128 rb_enc_mbcput(code, buf, enc); 02129 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) { 02130 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02131 } 02132 rb_str_resize(str1, pos+len); 02133 strncpy(RSTRING_PTR(str1) + pos, buf, len); 02134 if (cr == ENC_CODERANGE_7BIT && code > 127) 02135 cr = ENC_CODERANGE_VALID; 02136 ENC_CODERANGE_SET(str1, cr); 02137 } 02138 return str1; 02139 } 02140 02141 /* 02142 * call-seq: 02143 * str.prepend(other_str) -> str 02144 * 02145 * Prepend---Prepend the given string to <i>str</i>. 02146 * 02147 * a = "world" 02148 * a.prepend("hello ") #=> "hello world" 02149 * a #=> "hello world" 02150 */ 02151 02152 static VALUE 02153 rb_str_prepend(VALUE str, VALUE str2) 02154 { 02155 StringValue(str2); 02156 StringValue(str); 02157 rb_str_update(str, 0L, 0L, str2); 02158 return str; 02159 } 02160 02161 st_index_t 02162 rb_str_hash(VALUE str) 02163 { 02164 int e = ENCODING_GET(str); 02165 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { 02166 e = 0; 02167 } 02168 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e; 02169 } 02170 02171 int 02172 rb_str_hash_cmp(VALUE str1, VALUE str2) 02173 { 02174 long len; 02175 02176 if (!rb_str_comparable(str1, str2)) return 1; 02177 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) && 02178 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) { 02179 return 0; 02180 } 02181 return 1; 02182 } 02183 02184 /* 02185 * call-seq: 02186 * str.hash -> fixnum 02187 * 02188 * Return a hash based on the string's length and content. 
02189 */ 02190 02191 static VALUE 02192 rb_str_hash_m(VALUE str) 02193 { 02194 st_index_t hval = rb_str_hash(str); 02195 return INT2FIX(hval); 02196 } 02197 02198 #define lesser(a,b) (((a)>(b))?(b):(a)) 02199 02200 int 02201 rb_str_comparable(VALUE str1, VALUE str2) 02202 { 02203 int idx1, idx2; 02204 int rc1, rc2; 02205 02206 if (RSTRING_LEN(str1) == 0) return TRUE; 02207 if (RSTRING_LEN(str2) == 0) return TRUE; 02208 idx1 = ENCODING_GET(str1); 02209 idx2 = ENCODING_GET(str2); 02210 if (idx1 == idx2) return TRUE; 02211 rc1 = rb_enc_str_coderange(str1); 02212 rc2 = rb_enc_str_coderange(str2); 02213 if (rc1 == ENC_CODERANGE_7BIT) { 02214 if (rc2 == ENC_CODERANGE_7BIT) return TRUE; 02215 if (rb_enc_asciicompat(rb_enc_from_index(idx2))) 02216 return TRUE; 02217 } 02218 if (rc2 == ENC_CODERANGE_7BIT) { 02219 if (rb_enc_asciicompat(rb_enc_from_index(idx1))) 02220 return TRUE; 02221 } 02222 return FALSE; 02223 } 02224 02225 int 02226 rb_str_cmp(VALUE str1, VALUE str2) 02227 { 02228 long len1, len2; 02229 const char *ptr1, *ptr2; 02230 int retval; 02231 02232 if (str1 == str2) return 0; 02233 RSTRING_GETMEM(str1, ptr1, len1); 02234 RSTRING_GETMEM(str2, ptr2, len2); 02235 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) { 02236 if (len1 == len2) { 02237 if (!rb_str_comparable(str1, str2)) { 02238 if (ENCODING_GET(str1) > ENCODING_GET(str2)) 02239 return 1; 02240 return -1; 02241 } 02242 return 0; 02243 } 02244 if (len1 > len2) return 1; 02245 return -1; 02246 } 02247 if (retval > 0) return 1; 02248 return -1; 02249 } 02250 02251 /* expect tail call optimization */ 02252 static VALUE 02253 str_eql(const VALUE str1, const VALUE str2) 02254 { 02255 const long len = RSTRING_LEN(str1); 02256 const char *ptr1, *ptr2; 02257 02258 if (len != RSTRING_LEN(str2)) return Qfalse; 02259 if (!rb_str_comparable(str1, str2)) return Qfalse; 02260 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2))) 02261 return Qtrue; 02262 if (memcmp(ptr1, ptr2, len) == 
0) 02263 return Qtrue; 02264 return Qfalse; 02265 } 02266 /* 02267 * call-seq: 02268 * str == obj -> true or false 02269 * 02270 * Equality---If <i>obj</i> is not a <code>String</code>, returns 02271 * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i> 02272 * <code><=></code> <i>obj</i> returns zero. 02273 */ 02274 02275 VALUE 02276 rb_str_equal(VALUE str1, VALUE str2) 02277 { 02278 if (str1 == str2) return Qtrue; 02279 if (TYPE(str2) != T_STRING) { 02280 if (!rb_respond_to(str2, rb_intern("to_str"))) { 02281 return Qfalse; 02282 } 02283 return rb_equal(str2, str1); 02284 } 02285 return str_eql(str1, str2); 02286 } 02287 02288 /* 02289 * call-seq: 02290 * str.eql?(other) -> true or false 02291 * 02292 * Two strings are equal if they have the same length and content. 02293 */ 02294 02295 static VALUE 02296 rb_str_eql(VALUE str1, VALUE str2) 02297 { 02298 if (str1 == str2) return Qtrue; 02299 if (TYPE(str2) != T_STRING) return Qfalse; 02300 return str_eql(str1, str2); 02301 } 02302 02303 /* 02304 * call-seq: 02305 * str <=> other_str -> -1, 0, +1 or nil 02306 * 02307 * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if 02308 * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than 02309 * <i>str</i>. If the strings are of different lengths, and the strings are 02310 * equal when compared up to the shortest length, then the longer string is 02311 * considered greater than the shorter one. In older versions of Ruby, setting 02312 * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated 02313 * in favor of using <code>String#casecmp</code>. 02314 * 02315 * <code><=></code> is the basis for the methods <code><</code>, 02316 * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>, 02317 * included from module <code>Comparable</code>. The method 02318 * <code>String#==</code> does not use <code>Comparable#==</code>. 
02319 * 02320 * "abcdef" <=> "abcde" #=> 1 02321 * "abcdef" <=> "abcdef" #=> 0 02322 * "abcdef" <=> "abcdefg" #=> -1 02323 * "abcdef" <=> "ABCDEF" #=> 1 02324 */ 02325 02326 static VALUE 02327 rb_str_cmp_m(VALUE str1, VALUE str2) 02328 { 02329 long result; 02330 02331 if (TYPE(str2) != T_STRING) { 02332 if (!rb_respond_to(str2, rb_intern("to_str"))) { 02333 return Qnil; 02334 } 02335 else if (!rb_respond_to(str2, rb_intern("<=>"))) { 02336 return Qnil; 02337 } 02338 else { 02339 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1); 02340 02341 if (NIL_P(tmp)) return Qnil; 02342 if (!FIXNUM_P(tmp)) { 02343 return rb_funcall(LONG2FIX(0), '-', 1, tmp); 02344 } 02345 result = -FIX2LONG(tmp); 02346 } 02347 } 02348 else { 02349 result = rb_str_cmp(str1, str2); 02350 } 02351 return LONG2NUM(result); 02352 } 02353 02354 /* 02355 * call-seq: 02356 * str.casecmp(other_str) -> -1, 0, +1 or nil 02357 * 02358 * Case-insensitive version of <code>String#<=></code>. 02359 * 02360 * "abcdef".casecmp("abcde") #=> 1 02361 * "aBcDeF".casecmp("abcdef") #=> 0 02362 * "abcdef".casecmp("abcdefg") #=> -1 02363 * "abcdef".casecmp("ABCDEF") #=> 0 02364 */ 02365 02366 static VALUE 02367 rb_str_casecmp(VALUE str1, VALUE str2) 02368 { 02369 long len; 02370 rb_encoding *enc; 02371 char *p1, *p1end, *p2, *p2end; 02372 02373 StringValue(str2); 02374 enc = rb_enc_compatible(str1, str2); 02375 if (!enc) { 02376 return Qnil; 02377 } 02378 02379 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1); 02380 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2); 02381 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { 02382 while (p1 < p1end && p2 < p2end) { 02383 if (*p1 != *p2) { 02384 unsigned int c1 = TOUPPER(*p1 & 0xff); 02385 unsigned int c2 = TOUPPER(*p2 & 0xff); 02386 if (c1 != c2) 02387 return INT2FIX(c1 < c2 ? 
-1 : 1); 02388 } 02389 p1++; 02390 p2++; 02391 } 02392 } 02393 else { 02394 while (p1 < p1end && p2 < p2end) { 02395 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc); 02396 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc); 02397 02398 if (0 <= c1 && 0 <= c2) { 02399 c1 = TOUPPER(c1); 02400 c2 = TOUPPER(c2); 02401 if (c1 != c2) 02402 return INT2FIX(c1 < c2 ? -1 : 1); 02403 } 02404 else { 02405 int r; 02406 l1 = rb_enc_mbclen(p1, p1end, enc); 02407 l2 = rb_enc_mbclen(p2, p2end, enc); 02408 len = l1 < l2 ? l1 : l2; 02409 r = memcmp(p1, p2, len); 02410 if (r != 0) 02411 return INT2FIX(r < 0 ? -1 : 1); 02412 if (l1 != l2) 02413 return INT2FIX(l1 < l2 ? -1 : 1); 02414 } 02415 p1 += l1; 02416 p2 += l2; 02417 } 02418 } 02419 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); 02420 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1); 02421 return INT2FIX(-1); 02422 } 02423 02424 static long 02425 rb_str_index(VALUE str, VALUE sub, long offset) 02426 { 02427 long pos; 02428 char *s, *sptr, *e; 02429 long len, slen; 02430 rb_encoding *enc; 02431 02432 enc = rb_enc_check(str, sub); 02433 if (is_broken_string(sub)) { 02434 return -1; 02435 } 02436 len = str_strlen(str, enc); 02437 slen = str_strlen(sub, enc); 02438 if (offset < 0) { 02439 offset += len; 02440 if (offset < 0) return -1; 02441 } 02442 if (len - offset < slen) return -1; 02443 s = RSTRING_PTR(str); 02444 e = s + RSTRING_LEN(str); 02445 if (offset) { 02446 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str)); 02447 s += offset; 02448 } 02449 if (slen == 0) return offset; 02450 /* need proceed one character at a time */ 02451 sptr = RSTRING_PTR(sub); 02452 slen = RSTRING_LEN(sub); 02453 len = RSTRING_LEN(str) - offset; 02454 for (;;) { 02455 char *t; 02456 pos = rb_memsearch(sptr, slen, s, len, enc); 02457 if (pos < 0) return pos; 02458 t = rb_enc_right_char_head(s, s+pos, e, enc); 02459 if (t == s + pos) break; 02460 if ((len -= t - s) <= 0) return -1; 02461 offset 
+= t - s; 02462 s = t; 02463 } 02464 return pos + offset; 02465 } 02466 02467 02468 /* 02469 * call-seq: 02470 * str.index(substring [, offset]) -> fixnum or nil 02471 * str.index(regexp [, offset]) -> fixnum or nil 02472 * 02473 * Returns the index of the first occurrence of the given <i>substring</i> or 02474 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 02475 * found. If the second parameter is present, it specifies the position in the 02476 * string to begin the search. 02477 * 02478 * "hello".index('e') #=> 1 02479 * "hello".index('lo') #=> 3 02480 * "hello".index('a') #=> nil 02481 * "hello".index(?e) #=> 1 02482 * "hello".index(/[aeiou]/, -3) #=> 4 02483 */ 02484 02485 static VALUE 02486 rb_str_index_m(int argc, VALUE *argv, VALUE str) 02487 { 02488 VALUE sub; 02489 VALUE initpos; 02490 long pos; 02491 02492 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { 02493 pos = NUM2LONG(initpos); 02494 } 02495 else { 02496 pos = 0; 02497 } 02498 if (pos < 0) { 02499 pos += str_strlen(str, STR_ENC_GET(str)); 02500 if (pos < 0) { 02501 if (TYPE(sub) == T_REGEXP) { 02502 rb_backref_set(Qnil); 02503 } 02504 return Qnil; 02505 } 02506 } 02507 02508 switch (TYPE(sub)) { 02509 case T_REGEXP: 02510 if (pos > str_strlen(str, STR_ENC_GET(str))) 02511 return Qnil; 02512 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02513 rb_enc_check(str, sub), single_byte_optimizable(str)); 02514 02515 pos = rb_reg_search(sub, str, pos, 0); 02516 pos = rb_str_sublen(str, pos); 02517 break; 02518 02519 default: { 02520 VALUE tmp; 02521 02522 tmp = rb_check_string_type(sub); 02523 if (NIL_P(tmp)) { 02524 rb_raise(rb_eTypeError, "type mismatch: %s given", 02525 rb_obj_classname(sub)); 02526 } 02527 sub = tmp; 02528 } 02529 /* fall through */ 02530 case T_STRING: 02531 pos = rb_str_index(str, sub, pos); 02532 pos = rb_str_sublen(str, pos); 02533 break; 02534 } 02535 02536 if (pos == -1) return Qnil; 02537 return LONG2NUM(pos); 02538 } 02539 02540 
static long 02541 rb_str_rindex(VALUE str, VALUE sub, long pos) 02542 { 02543 long len, slen; 02544 char *s, *sbeg, *e, *t; 02545 rb_encoding *enc; 02546 int singlebyte = single_byte_optimizable(str); 02547 02548 enc = rb_enc_check(str, sub); 02549 if (is_broken_string(sub)) { 02550 return -1; 02551 } 02552 len = str_strlen(str, enc); 02553 slen = str_strlen(sub, enc); 02554 /* substring longer than string */ 02555 if (len < slen) return -1; 02556 if (len - pos < slen) { 02557 pos = len - slen; 02558 } 02559 if (len == 0) { 02560 return pos; 02561 } 02562 sbeg = RSTRING_PTR(str); 02563 e = RSTRING_END(str); 02564 t = RSTRING_PTR(sub); 02565 slen = RSTRING_LEN(sub); 02566 s = str_nth(sbeg, e, pos, enc, singlebyte); 02567 while (s) { 02568 if (memcmp(s, t, slen) == 0) { 02569 return pos; 02570 } 02571 if (pos == 0) break; 02572 pos--; 02573 s = rb_enc_prev_char(sbeg, s, e, enc); 02574 } 02575 return -1; 02576 } 02577 02578 02579 /* 02580 * call-seq: 02581 * str.rindex(substring [, fixnum]) -> fixnum or nil 02582 * str.rindex(regexp [, fixnum]) -> fixnum or nil 02583 * 02584 * Returns the index of the last occurrence of the given <i>substring</i> or 02585 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 02586 * found. If the second parameter is present, it specifies the position in the 02587 * string to end the search---characters beyond this point will not be 02588 * considered. 
02589 * 02590 * "hello".rindex('e') #=> 1 02591 * "hello".rindex('l') #=> 3 02592 * "hello".rindex('a') #=> nil 02593 * "hello".rindex(?e) #=> 1 02594 * "hello".rindex(/[aeiou]/, -2) #=> 1 02595 */ 02596 02597 static VALUE 02598 rb_str_rindex_m(int argc, VALUE *argv, VALUE str) 02599 { 02600 VALUE sub; 02601 VALUE vpos; 02602 rb_encoding *enc = STR_ENC_GET(str); 02603 long pos, len = str_strlen(str, enc); 02604 02605 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { 02606 pos = NUM2LONG(vpos); 02607 if (pos < 0) { 02608 pos += len; 02609 if (pos < 0) { 02610 if (TYPE(sub) == T_REGEXP) { 02611 rb_backref_set(Qnil); 02612 } 02613 return Qnil; 02614 } 02615 } 02616 if (pos > len) pos = len; 02617 } 02618 else { 02619 pos = len; 02620 } 02621 02622 switch (TYPE(sub)) { 02623 case T_REGEXP: 02624 /* enc = rb_get_check(str, sub); */ 02625 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02626 STR_ENC_GET(str), single_byte_optimizable(str)); 02627 02628 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) { 02629 pos = rb_reg_search(sub, str, pos, 1); 02630 pos = rb_str_sublen(str, pos); 02631 } 02632 if (pos >= 0) return LONG2NUM(pos); 02633 break; 02634 02635 default: { 02636 VALUE tmp; 02637 02638 tmp = rb_check_string_type(sub); 02639 if (NIL_P(tmp)) { 02640 rb_raise(rb_eTypeError, "type mismatch: %s given", 02641 rb_obj_classname(sub)); 02642 } 02643 sub = tmp; 02644 } 02645 /* fall through */ 02646 case T_STRING: 02647 pos = rb_str_rindex(str, sub, pos); 02648 if (pos >= 0) return LONG2NUM(pos); 02649 break; 02650 } 02651 return Qnil; 02652 } 02653 02654 /* 02655 * call-seq: 02656 * str =~ obj -> fixnum or nil 02657 * 02658 * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match 02659 * against <i>str</i>,and returns the position the match starts, or 02660 * <code>nil</code> if there is no match. Otherwise, invokes 02661 * <i>obj.=~</i>, passing <i>str</i> as an argument. 
The default 02662 * <code>=~</code> in <code>Object</code> returns <code>nil</code>. 02663 * 02664 * "cat o' 9 tails" =~ /\d/ #=> 7 02665 * "cat o' 9 tails" =~ 9 #=> nil 02666 */ 02667 02668 static VALUE 02669 rb_str_match(VALUE x, VALUE y) 02670 { 02671 switch (TYPE(y)) { 02672 case T_STRING: 02673 rb_raise(rb_eTypeError, "type mismatch: String given"); 02674 02675 case T_REGEXP: 02676 return rb_reg_match(y, x); 02677 02678 default: 02679 return rb_funcall(y, rb_intern("=~"), 1, x); 02680 } 02681 } 02682 02683 02684 static VALUE get_pat(VALUE, int); 02685 02686 02687 /* 02688 * call-seq: 02689 * str.match(pattern) -> matchdata or nil 02690 * str.match(pattern, pos) -> matchdata or nil 02691 * 02692 * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one), 02693 * then invokes its <code>match</code> method on <i>str</i>. If the second 02694 * parameter is present, it specifies the position in the string to begin the 02695 * search. 02696 * 02697 * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l"> 02698 * 'hello'.match('(.)\1')[0] #=> "ll" 02699 * 'hello'.match(/(.)\1/)[0] #=> "ll" 02700 * 'hello'.match('xx') #=> nil 02701 * 02702 * If a block is given, invoke the block with MatchData if match succeed, so 02703 * that you can write 02704 * 02705 * str.match(pat) {|m| ...} 02706 * 02707 * instead of 02708 * 02709 * if m = str.match(pat) 02710 * ... 02711 * end 02712 * 02713 * The return value is a value from block execution in this case. 
02714 */ 02715 02716 static VALUE 02717 rb_str_match_m(int argc, VALUE *argv, VALUE str) 02718 { 02719 VALUE re, result; 02720 if (argc < 1) 02721 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 02722 re = argv[0]; 02723 argv[0] = str; 02724 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv); 02725 if (!NIL_P(result) && rb_block_given_p()) { 02726 return rb_yield(result); 02727 } 02728 return result; 02729 } 02730 02731 enum neighbor_char { 02732 NEIGHBOR_NOT_CHAR, 02733 NEIGHBOR_FOUND, 02734 NEIGHBOR_WRAPPED 02735 }; 02736 02737 static enum neighbor_char 02738 enc_succ_char(char *p, long len, rb_encoding *enc) 02739 { 02740 long i; 02741 int l; 02742 while (1) { 02743 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--) 02744 p[i] = '\0'; 02745 if (i < 0) 02746 return NEIGHBOR_WRAPPED; 02747 ++((unsigned char*)p)[i]; 02748 l = rb_enc_precise_mbclen(p, p+len, enc); 02749 if (MBCLEN_CHARFOUND_P(l)) { 02750 l = MBCLEN_CHARFOUND_LEN(l); 02751 if (l == len) { 02752 return NEIGHBOR_FOUND; 02753 } 02754 else { 02755 memset(p+l, 0xff, len-l); 02756 } 02757 } 02758 if (MBCLEN_INVALID_P(l) && i < len-1) { 02759 long len2; 02760 int l2; 02761 for (len2 = len-1; 0 < len2; len2--) { 02762 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02763 if (!MBCLEN_INVALID_P(l2)) 02764 break; 02765 } 02766 memset(p+len2+1, 0xff, len-(len2+1)); 02767 } 02768 } 02769 } 02770 02771 static enum neighbor_char 02772 enc_pred_char(char *p, long len, rb_encoding *enc) 02773 { 02774 long i; 02775 int l; 02776 while (1) { 02777 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--) 02778 p[i] = '\xff'; 02779 if (i < 0) 02780 return NEIGHBOR_WRAPPED; 02781 --((unsigned char*)p)[i]; 02782 l = rb_enc_precise_mbclen(p, p+len, enc); 02783 if (MBCLEN_CHARFOUND_P(l)) { 02784 l = MBCLEN_CHARFOUND_LEN(l); 02785 if (l == len) { 02786 return NEIGHBOR_FOUND; 02787 } 02788 else { 02789 memset(p+l, 0, len-l); 02790 } 02791 } 02792 if (MBCLEN_INVALID_P(l) && i < 
len-1) { 02793 long len2; 02794 int l2; 02795 for (len2 = len-1; 0 < len2; len2--) { 02796 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02797 if (!MBCLEN_INVALID_P(l2)) 02798 break; 02799 } 02800 memset(p+len2+1, 0, len-(len2+1)); 02801 } 02802 } 02803 } 02804 02805 /* 02806 overwrite +p+ by succeeding letter in +enc+ and returns 02807 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED. 02808 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry. 02809 assuming each ranges are successive, and mbclen 02810 never change in each ranges. 02811 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one 02812 character. 02813 */ 02814 static enum neighbor_char 02815 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) 02816 { 02817 enum neighbor_char ret; 02818 unsigned int c; 02819 int ctype; 02820 int range; 02821 char save[ONIGENC_CODE_TO_MBC_MAXLEN]; 02822 02823 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02824 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc)) 02825 ctype = ONIGENC_CTYPE_DIGIT; 02826 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc)) 02827 ctype = ONIGENC_CTYPE_ALPHA; 02828 else 02829 return NEIGHBOR_NOT_CHAR; 02830 02831 MEMCPY(save, p, char, len); 02832 ret = enc_succ_char(p, len, enc); 02833 if (ret == NEIGHBOR_FOUND) { 02834 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02835 if (rb_enc_isctype(c, ctype, enc)) 02836 return NEIGHBOR_FOUND; 02837 } 02838 MEMCPY(p, save, char, len); 02839 range = 1; 02840 while (1) { 02841 MEMCPY(save, p, char, len); 02842 ret = enc_pred_char(p, len, enc); 02843 if (ret == NEIGHBOR_FOUND) { 02844 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02845 if (!rb_enc_isctype(c, ctype, enc)) { 02846 MEMCPY(p, save, char, len); 02847 break; 02848 } 02849 } 02850 else { 02851 MEMCPY(p, save, char, len); 02852 break; 02853 } 02854 range++; 02855 } 02856 if (range == 1) { 02857 return NEIGHBOR_NOT_CHAR; 02858 } 02859 02860 if (ctype != ONIGENC_CTYPE_DIGIT) { 02861 MEMCPY(carry, p, char, len); 02862 
return NEIGHBOR_WRAPPED; 02863 } 02864 02865 MEMCPY(carry, p, char, len); 02866 enc_succ_char(carry, len, enc); 02867 return NEIGHBOR_WRAPPED; 02868 } 02869 02870 02871 /* 02872 * call-seq: 02873 * str.succ -> new_str 02874 * str.next -> new_str 02875 * 02876 * Returns the successor to <i>str</i>. The successor is calculated by 02877 * incrementing characters starting from the rightmost alphanumeric (or 02878 * the rightmost character if there are no alphanumerics) in the 02879 * string. Incrementing a digit always results in another digit, and 02880 * incrementing a letter results in another letter of the same case. 02881 * Incrementing nonalphanumerics uses the underlying character set's 02882 * collating sequence. 02883 * 02884 * If the increment generates a ``carry,'' the character to the left of 02885 * it is incremented. This process repeats until there is no carry, 02886 * adding an additional character if necessary. 02887 * 02888 * "abcd".succ #=> "abce" 02889 * "THX1138".succ #=> "THX1139" 02890 * "<<koala>>".succ #=> "<<koalb>>" 02891 * "1999zzz".succ #=> "2000aaa" 02892 * "ZZZ9999".succ #=> "AAAA0000" 02893 * "***".succ #=> "**+" 02894 */ 02895 02896 VALUE 02897 rb_str_succ(VALUE orig) 02898 { 02899 rb_encoding *enc; 02900 VALUE str; 02901 char *sbeg, *s, *e, *last_alnum = 0; 02902 int c = -1; 02903 long l; 02904 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1"; 02905 long carry_pos = 0, carry_len = 1; 02906 enum neighbor_char neighbor = NEIGHBOR_FOUND; 02907 02908 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); 02909 rb_enc_cr_str_copy_for_substr(str, orig); 02910 OBJ_INFECT(str, orig); 02911 if (RSTRING_LEN(str) == 0) return str; 02912 02913 enc = STR_ENC_GET(orig); 02914 sbeg = RSTRING_PTR(str); 02915 s = e = sbeg + RSTRING_LEN(str); 02916 02917 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 02918 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) { 02919 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) : 02920 ISDIGIT(*last_alnum) ? 
ISALPHA(*s) : 0) { 02921 s = last_alnum; 02922 break; 02923 } 02924 } 02925 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 02926 neighbor = enc_succ_alnum_char(s, l, enc, carry); 02927 switch (neighbor) { 02928 case NEIGHBOR_NOT_CHAR: 02929 continue; 02930 case NEIGHBOR_FOUND: 02931 return str; 02932 case NEIGHBOR_WRAPPED: 02933 last_alnum = s; 02934 break; 02935 } 02936 c = 1; 02937 carry_pos = s - sbeg; 02938 carry_len = l; 02939 } 02940 if (c == -1) { /* str contains no alnum */ 02941 s = e; 02942 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 02943 enum neighbor_char neighbor; 02944 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 02945 neighbor = enc_succ_char(s, l, enc); 02946 if (neighbor == NEIGHBOR_FOUND) 02947 return str; 02948 if (rb_enc_precise_mbclen(s, s+l, enc) != l) { 02949 /* wrapped to \0...\0. search next valid char. */ 02950 enc_succ_char(s, l, enc); 02951 } 02952 if (!rb_enc_asciicompat(enc)) { 02953 MEMCPY(carry, s, char, l); 02954 carry_len = l; 02955 } 02956 carry_pos = s - sbeg; 02957 } 02958 } 02959 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len); 02960 s = RSTRING_PTR(str) + carry_pos; 02961 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos); 02962 memmove(s, carry, carry_len); 02963 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); 02964 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 02965 rb_enc_str_coderange(str); 02966 return str; 02967 } 02968 02969 02970 /* 02971 * call-seq: 02972 * str.succ! -> str 02973 * str.next! -> str 02974 * 02975 * Equivalent to <code>String#succ</code>, but modifies the receiver in 02976 * place. 
02977 */ 02978 02979 static VALUE 02980 rb_str_succ_bang(VALUE str) 02981 { 02982 rb_str_shared_replace(str, rb_str_succ(str)); 02983 02984 return str; 02985 } 02986 02987 02988 /* 02989 * call-seq: 02990 * str.upto(other_str, exclusive=false) {|s| block } -> str 02991 * str.upto(other_str, exclusive=false) -> an_enumerator 02992 * 02993 * Iterates through successive values, starting at <i>str</i> and 02994 * ending at <i>other_str</i> inclusive, passing each value in turn to 02995 * the block. The <code>String#succ</code> method is used to generate 02996 * each value. If optional second argument exclusive is omitted or is false, 02997 * the last value will be included; otherwise it will be excluded. 02998 * 02999 * If no block is given, an enumerator is returned instead. 03000 * 03001 * "a8".upto("b6") {|s| print s, ' ' } 03002 * for s in "a8".."b6" 03003 * print s, ' ' 03004 * end 03005 * 03006 * <em>produces:</em> 03007 * 03008 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03009 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03010 * 03011 * If <i>str</i> and <i>other_str</i> contains only ascii numeric characters, 03012 * both are recognized as decimal numbers. In addition, the width of 03013 * string (e.g. leading zeros) is handled appropriately. 
03014 * 03015 * "9".upto("11").to_a #=> ["9", "10", "11"] 03016 * "25".upto("5").to_a #=> [] 03017 * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"] 03018 */ 03019 03020 static VALUE 03021 rb_str_upto(int argc, VALUE *argv, VALUE beg) 03022 { 03023 VALUE end, exclusive; 03024 VALUE current, after_end; 03025 ID succ; 03026 int n, excl, ascii; 03027 rb_encoding *enc; 03028 03029 rb_scan_args(argc, argv, "11", &end, &exclusive); 03030 RETURN_ENUMERATOR(beg, argc, argv); 03031 excl = RTEST(exclusive); 03032 CONST_ID(succ, "succ"); 03033 StringValue(end); 03034 enc = rb_enc_check(beg, end); 03035 ascii = (is_ascii_string(beg) && is_ascii_string(end)); 03036 /* single character */ 03037 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) { 03038 char c = RSTRING_PTR(beg)[0]; 03039 char e = RSTRING_PTR(end)[0]; 03040 03041 if (c > e || (excl && c == e)) return beg; 03042 for (;;) { 03043 rb_yield(rb_enc_str_new(&c, 1, enc)); 03044 if (!excl && c == e) break; 03045 c++; 03046 if (excl && c == e) break; 03047 } 03048 return beg; 03049 } 03050 /* both edges are all digits */ 03051 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) { 03052 char *s, *send; 03053 VALUE b, e; 03054 int width; 03055 03056 s = RSTRING_PTR(beg); send = RSTRING_END(beg); 03057 width = rb_long2int(send - s); 03058 while (s < send) { 03059 if (!ISDIGIT(*s)) goto no_digits; 03060 s++; 03061 } 03062 s = RSTRING_PTR(end); send = RSTRING_END(end); 03063 while (s < send) { 03064 if (!ISDIGIT(*s)) goto no_digits; 03065 s++; 03066 } 03067 b = rb_str_to_inum(beg, 10, FALSE); 03068 e = rb_str_to_inum(end, 10, FALSE); 03069 if (FIXNUM_P(b) && FIXNUM_P(e)) { 03070 long bi = FIX2LONG(b); 03071 long ei = FIX2LONG(e); 03072 rb_encoding *usascii = rb_usascii_encoding(); 03073 03074 while (bi <= ei) { 03075 if (excl && bi == ei) break; 03076 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi)); 03077 bi++; 03078 } 03079 } 03080 else { 03081 ID op = excl ? 
'<' : rb_intern("<="); 03082 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d")); 03083 03084 args[0] = INT2FIX(width); 03085 while (rb_funcall(b, op, 1, e)) { 03086 args[1] = b; 03087 rb_yield(rb_str_format(numberof(args), args, fmt)); 03088 b = rb_funcall(b, succ, 0, 0); 03089 } 03090 } 03091 return beg; 03092 } 03093 /* normal case */ 03094 no_digits: 03095 n = rb_str_cmp(beg, end); 03096 if (n > 0 || (excl && n == 0)) return beg; 03097 03098 after_end = rb_funcall(end, succ, 0, 0); 03099 current = rb_str_dup(beg); 03100 while (!rb_str_equal(current, after_end)) { 03101 VALUE next = Qnil; 03102 if (excl || !rb_str_equal(current, end)) 03103 next = rb_funcall(current, succ, 0, 0); 03104 rb_yield(current); 03105 if (NIL_P(next)) break; 03106 current = next; 03107 StringValue(current); 03108 if (excl && rb_str_equal(current, end)) break; 03109 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0) 03110 break; 03111 } 03112 03113 return beg; 03114 } 03115 03116 static VALUE 03117 rb_str_subpat(VALUE str, VALUE re, VALUE backref) 03118 { 03119 if (rb_reg_search(re, str, 0, 0) >= 0) { 03120 VALUE match = rb_backref_get(); 03121 int nth = rb_reg_backref_number(match, backref); 03122 return rb_reg_nth_match(nth, match); 03123 } 03124 return Qnil; 03125 } 03126 03127 static VALUE 03128 rb_str_aref(VALUE str, VALUE indx) 03129 { 03130 long idx; 03131 03132 switch (TYPE(indx)) { 03133 case T_FIXNUM: 03134 idx = FIX2LONG(indx); 03135 03136 num_index: 03137 str = rb_str_substr(str, idx, 1); 03138 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil; 03139 return str; 03140 03141 case T_REGEXP: 03142 return rb_str_subpat(str, indx, INT2FIX(0)); 03143 03144 case T_STRING: 03145 if (rb_str_index(str, indx, 0) != -1) 03146 return rb_str_dup(indx); 03147 return Qnil; 03148 03149 default: 03150 /* check if indx is Range */ 03151 { 03152 long beg, len; 03153 VALUE tmp; 03154 03155 len = str_strlen(str, STR_ENC_GET(str)); 03156 switch 
(rb_range_beg_len(indx, &beg, &len, len, 0)) { 03157 case Qfalse: 03158 break; 03159 case Qnil: 03160 return Qnil; 03161 default: 03162 tmp = rb_str_substr(str, beg, len); 03163 return tmp; 03164 } 03165 } 03166 idx = NUM2LONG(indx); 03167 goto num_index; 03168 } 03169 return Qnil; /* not reached */ 03170 } 03171 03172 03173 /* 03174 * call-seq: 03175 * str[fixnum] -> new_str or nil 03176 * str[fixnum, fixnum] -> new_str or nil 03177 * str[range] -> new_str or nil 03178 * str[regexp] -> new_str or nil 03179 * str[regexp, fixnum] -> new_str or nil 03180 * str[other_str] -> new_str or nil 03181 * str.slice(fixnum) -> new_str or nil 03182 * str.slice(fixnum, fixnum) -> new_str or nil 03183 * str.slice(range) -> new_str or nil 03184 * str.slice(regexp) -> new_str or nil 03185 * str.slice(regexp, fixnum) -> new_str or nil 03186 * str.slice(regexp, capname) -> new_str or nil 03187 * str.slice(other_str) -> new_str or nil 03188 * 03189 * Element Reference---If passed a single <code>Fixnum</code>, returns a 03190 * substring of one character at that position. If passed two <code>Fixnum</code> 03191 * objects, returns a substring starting at the offset given by the first, and 03192 * with a length given by the second. If passed a range, its beginning and end 03193 * are interpreted as offsets delimiting the substring to be returned. In all 03194 * three cases, if an offset is negative, it is counted from the end of <i>str</i>. 03195 * Returns <code>nil</code> if the initial offset falls outside the string or 03196 * the length is negative. 03197 * 03198 * If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is 03199 * returned. If a numeric or name parameter follows the regular expression, that 03200 * component of the <code>MatchData</code> is returned instead. If a 03201 * <code>String</code> is given, that string is returned if it occurs in 03202 * <i>str</i>. In both cases, <code>nil</code> is returned if there is no 03203 * match. 
03204 * 03205 * a = "hello there" 03206 * a[1] #=> "e" 03207 * a[2, 3] #=> "llo" 03208 * a[2..3] #=> "ll" 03209 * a[-3, 2] #=> "er" 03210 * a[7..-2] #=> "her" 03211 * a[-4..-2] #=> "her" 03212 * a[-2..-4] #=> "" 03213 * a[12..-1] #=> nil 03214 * a[/[aeiou](.)\1/] #=> "ell" 03215 * a[/[aeiou](.)\1/, 0] #=> "ell" 03216 * a[/[aeiou](.)\1/, 1] #=> "l" 03217 * a[/[aeiou](.)\1/, 2] #=> nil 03218 * a["lo"] #=> "lo" 03219 * a["bye"] #=> nil 03220 */ 03221 03222 static VALUE 03223 rb_str_aref_m(int argc, VALUE *argv, VALUE str) 03224 { 03225 if (argc == 2) { 03226 if (TYPE(argv[0]) == T_REGEXP) { 03227 return rb_str_subpat(str, argv[0], argv[1]); 03228 } 03229 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 03230 } 03231 if (argc != 1) { 03232 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03233 } 03234 return rb_str_aref(str, argv[0]); 03235 } 03236 03237 VALUE 03238 rb_str_drop_bytes(VALUE str, long len) 03239 { 03240 char *ptr = RSTRING_PTR(str); 03241 long olen = RSTRING_LEN(str), nlen; 03242 03243 str_modifiable(str); 03244 if (len > olen) len = olen; 03245 nlen = olen - len; 03246 if (nlen <= RSTRING_EMBED_LEN_MAX) { 03247 char *oldptr = ptr; 03248 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED)); 03249 STR_SET_EMBED(str); 03250 STR_SET_EMBED_LEN(str, nlen); 03251 ptr = RSTRING(str)->as.ary; 03252 memmove(ptr, oldptr + len, nlen); 03253 if (fl == STR_NOEMBED) xfree(oldptr); 03254 } 03255 else { 03256 if (!STR_SHARED_P(str)) rb_str_new4(str); 03257 ptr = RSTRING(str)->as.heap.ptr += len; 03258 RSTRING(str)->as.heap.len = nlen; 03259 } 03260 ptr[nlen] = 0; 03261 ENC_CODERANGE_CLEAR(str); 03262 return str; 03263 } 03264 03265 static void 03266 rb_str_splice_0(VALUE str, long beg, long len, VALUE val) 03267 { 03268 if (beg == 0 && RSTRING_LEN(val) == 0) { 03269 rb_str_drop_bytes(str, len); 03270 OBJ_INFECT(str, val); 03271 return; 03272 } 03273 03274 rb_str_modify(str); 03275 if (len < RSTRING_LEN(val)) { 03276 /* 
expand string */ 03277 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1); 03278 } 03279 03280 if (RSTRING_LEN(val) != len) { 03281 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val), 03282 RSTRING_PTR(str) + beg + len, 03283 RSTRING_LEN(str) - (beg + len)); 03284 } 03285 if (RSTRING_LEN(val) < beg && len < 0) { 03286 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len); 03287 } 03288 if (RSTRING_LEN(val) > 0) { 03289 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val)); 03290 } 03291 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len); 03292 if (RSTRING_PTR(str)) { 03293 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 03294 } 03295 OBJ_INFECT(str, val); 03296 } 03297 03298 static void 03299 rb_str_splice(VALUE str, long beg, long len, VALUE val) 03300 { 03301 long slen; 03302 char *p, *e; 03303 rb_encoding *enc; 03304 int singlebyte = single_byte_optimizable(str); 03305 int cr; 03306 03307 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 03308 03309 StringValue(val); 03310 enc = rb_enc_check(str, val); 03311 slen = str_strlen(str, enc); 03312 03313 if (slen < beg) { 03314 out_of_range: 03315 rb_raise(rb_eIndexError, "index %ld out of string", beg); 03316 } 03317 if (beg < 0) { 03318 if (-beg > slen) { 03319 goto out_of_range; 03320 } 03321 beg += slen; 03322 } 03323 if (slen < len || slen < beg + len) { 03324 len = slen - beg; 03325 } 03326 str_modify_keep_cr(str); 03327 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte); 03328 if (!p) p = RSTRING_END(str); 03329 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte); 03330 if (!e) e = RSTRING_END(str); 03331 /* error check */ 03332 beg = p - RSTRING_PTR(str); /* physical position */ 03333 len = e - p; /* physical length */ 03334 rb_str_splice_0(str, beg, len, val); 03335 rb_enc_associate(str, enc); 03336 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val)); 03337 if (cr != ENC_CODERANGE_BROKEN) 03338 ENC_CODERANGE_SET(str, cr); 03339 } 
03340 03341 void 03342 rb_str_update(VALUE str, long beg, long len, VALUE val) 03343 { 03344 rb_str_splice(str, beg, len, val); 03345 } 03346 03347 static void 03348 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) 03349 { 03350 int nth; 03351 VALUE match; 03352 long start, end, len; 03353 rb_encoding *enc; 03354 struct re_registers *regs; 03355 03356 if (rb_reg_search(re, str, 0, 0) < 0) { 03357 rb_raise(rb_eIndexError, "regexp not matched"); 03358 } 03359 match = rb_backref_get(); 03360 nth = rb_reg_backref_number(match, backref); 03361 regs = RMATCH_REGS(match); 03362 if (nth >= regs->num_regs) { 03363 out_of_range: 03364 rb_raise(rb_eIndexError, "index %d out of regexp", nth); 03365 } 03366 if (nth < 0) { 03367 if (-nth >= regs->num_regs) { 03368 goto out_of_range; 03369 } 03370 nth += regs->num_regs; 03371 } 03372 03373 start = BEG(nth); 03374 if (start == -1) { 03375 rb_raise(rb_eIndexError, "regexp group %d not matched", nth); 03376 } 03377 end = END(nth); 03378 len = end - start; 03379 StringValue(val); 03380 enc = rb_enc_check(str, val); 03381 rb_str_splice_0(str, start, len, val); 03382 rb_enc_associate(str, enc); 03383 } 03384 03385 static VALUE 03386 rb_str_aset(VALUE str, VALUE indx, VALUE val) 03387 { 03388 long idx, beg; 03389 03390 switch (TYPE(indx)) { 03391 case T_FIXNUM: 03392 idx = FIX2LONG(indx); 03393 num_index: 03394 rb_str_splice(str, idx, 1, val); 03395 return val; 03396 03397 case T_REGEXP: 03398 rb_str_subpat_set(str, indx, INT2FIX(0), val); 03399 return val; 03400 03401 case T_STRING: 03402 beg = rb_str_index(str, indx, 0); 03403 if (beg < 0) { 03404 rb_raise(rb_eIndexError, "string not matched"); 03405 } 03406 beg = rb_str_sublen(str, beg); 03407 rb_str_splice(str, beg, str_strlen(indx, 0), val); 03408 return val; 03409 03410 default: 03411 /* check if indx is Range */ 03412 { 03413 long beg, len; 03414 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) { 03415 rb_str_splice(str, beg, len, val); 03416 return 
val; 03417 } 03418 } 03419 idx = NUM2LONG(indx); 03420 goto num_index; 03421 } 03422 } 03423 03424 /* 03425 * call-seq: 03426 * str[fixnum] = new_str 03427 * str[fixnum, fixnum] = new_str 03428 * str[range] = aString 03429 * str[regexp] = new_str 03430 * str[regexp, fixnum] = new_str 03431 * str[regexp, name] = new_str 03432 * str[other_str] = new_str 03433 * 03434 * Element Assignment---Replaces some or all of the content of <i>str</i>. The 03435 * portion of the string affected is determined using the same criteria as 03436 * <code>String#[]</code>. If the replacement string is not the same length as 03437 * the text it is replacing, the string will be adjusted accordingly. If the 03438 * regular expression or string is used as the index doesn't match a position 03439 * in the string, <code>IndexError</code> is raised. If the regular expression 03440 * form is used, the optional second <code>Fixnum</code> allows you to specify 03441 * which portion of the match to replace (effectively using the 03442 * <code>MatchData</code> indexing rules. The forms that take a 03443 * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is 03444 * out of range; the <code>Range</code> form will raise a 03445 * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code> 03446 * forms will silently ignore the assignment. 
03447 */ 03448 03449 static VALUE 03450 rb_str_aset_m(int argc, VALUE *argv, VALUE str) 03451 { 03452 if (argc == 3) { 03453 if (TYPE(argv[0]) == T_REGEXP) { 03454 rb_str_subpat_set(str, argv[0], argv[1], argv[2]); 03455 } 03456 else { 03457 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]); 03458 } 03459 return argv[2]; 03460 } 03461 if (argc != 2) { 03462 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc); 03463 } 03464 return rb_str_aset(str, argv[0], argv[1]); 03465 } 03466 03467 /* 03468 * call-seq: 03469 * str.insert(index, other_str) -> str 03470 * 03471 * Inserts <i>other_str</i> before the character at the given 03472 * <i>index</i>, modifying <i>str</i>. Negative indices count from the 03473 * end of the string, and insert <em>after</em> the given character. 03474 * The intent is insert <i>aString</i> so that it starts at the given 03475 * <i>index</i>. 03476 * 03477 * "abcd".insert(0, 'X') #=> "Xabcd" 03478 * "abcd".insert(3, 'X') #=> "abcXd" 03479 * "abcd".insert(4, 'X') #=> "abcdX" 03480 * "abcd".insert(-3, 'X') #=> "abXcd" 03481 * "abcd".insert(-1, 'X') #=> "abcdX" 03482 */ 03483 03484 static VALUE 03485 rb_str_insert(VALUE str, VALUE idx, VALUE str2) 03486 { 03487 long pos = NUM2LONG(idx); 03488 03489 if (pos == -1) { 03490 return rb_str_append(str, str2); 03491 } 03492 else if (pos < 0) { 03493 pos++; 03494 } 03495 rb_str_splice(str, pos, 0, str2); 03496 return str; 03497 } 03498 03499 03500 /* 03501 * call-seq: 03502 * str.slice!(fixnum) -> fixnum or nil 03503 * str.slice!(fixnum, fixnum) -> new_str or nil 03504 * str.slice!(range) -> new_str or nil 03505 * str.slice!(regexp) -> new_str or nil 03506 * str.slice!(other_str) -> new_str or nil 03507 * 03508 * Deletes the specified portion from <i>str</i>, and returns the portion 03509 * deleted. 
03510 * 03511 * string = "this is a string" 03512 * string.slice!(2) #=> "i" 03513 * string.slice!(3..6) #=> " is " 03514 * string.slice!(/s.*t/) #=> "sa st" 03515 * string.slice!("r") #=> "r" 03516 * string #=> "thing" 03517 */ 03518 03519 static VALUE 03520 rb_str_slice_bang(int argc, VALUE *argv, VALUE str) 03521 { 03522 VALUE result; 03523 VALUE buf[3]; 03524 int i; 03525 03526 if (argc < 1 || 2 < argc) { 03527 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03528 } 03529 for (i=0; i<argc; i++) { 03530 buf[i] = argv[i]; 03531 } 03532 str_modify_keep_cr(str); 03533 result = rb_str_aref_m(argc, buf, str); 03534 if (!NIL_P(result)) { 03535 buf[i] = rb_str_new(0,0); 03536 rb_str_aset_m(argc+1, buf, str); 03537 } 03538 return result; 03539 } 03540 03541 static VALUE 03542 get_pat(VALUE pat, int quote) 03543 { 03544 VALUE val; 03545 03546 switch (TYPE(pat)) { 03547 case T_REGEXP: 03548 return pat; 03549 03550 case T_STRING: 03551 break; 03552 03553 default: 03554 val = rb_check_string_type(pat); 03555 if (NIL_P(val)) { 03556 Check_Type(pat, T_REGEXP); 03557 } 03558 pat = val; 03559 } 03560 03561 if (quote) { 03562 pat = rb_reg_quote(pat); 03563 } 03564 03565 return rb_reg_regcomp(pat); 03566 } 03567 03568 03569 /* 03570 * call-seq: 03571 * str.sub!(pattern, replacement) -> str or nil 03572 * str.sub!(pattern) {|match| block } -> str or nil 03573 * 03574 * Performs the substitutions of <code>String#sub</code> in place, 03575 * returning <i>str</i>, or <code>nil</code> if no substitutions were 03576 * performed. 
*/

static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE pat, repl, hash = Qnil;
    int iter = 0;               /* non-zero when the block form is used */
    int tainted = 0;
    int untrusted = 0;
    long plen;

    /* One argument plus a block selects the block form; two arguments give
     * a replacement that is either a Hash or something string-like. */
    if (argc == 1 && rb_block_given_p()) {
        iter = 1;
    }
    else if (argc == 2) {
        repl = argv[1];
        hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
        if (OBJ_UNTRUSTED(repl)) untrusted = 1;
    }
    else {
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
    }

    pat = get_pat(argv[0], 1);
    str_modifiable(str);
    if (rb_reg_search(pat, str, 0, 0) >= 0) {
        rb_encoding *enc;
        int cr = ENC_CODERANGE(str);
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        long beg0 = BEG(0);     /* byte offsets of the whole match */
        long end0 = END(0);
        char *p, *rp;
        long len, rlen;

        if (iter || !NIL_P(hash)) {
            /* Remember pointer and length before running user code (the
             * block, or the hash lookup) so str_mod_check() can compare
             * against them afterwards. */
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);

            if (iter) {
                repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                repl = rb_obj_as_string(repl);
            }
            str_mod_check(str, p, len);
            rb_check_frozen(str);
        }
        else {
            /* plain replacement string: expand back-references */
            repl = rb_reg_regsub(repl, str, regs, pat);
        }
        enc = rb_enc_compatible(str, repl);
        if (!enc) {
            /* Incompatible encodings are tolerated only when the parts of
             * str outside the match both scan as 7-bit; the result then
             * takes the replacement's encoding. */
            rb_encoding *str_enc = STR_ENC_GET(str);
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);
            if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
                coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
                         rb_enc_name(str_enc),
                         rb_enc_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
        rb_str_modify(str);
        rb_enc_associate(str, enc);
        if (OBJ_TAINTED(repl)) tainted = 1;
        if (OBJ_UNTRUSTED(repl)) untrusted = 1;
        /* Combine the code range captured before the edit with the
         * replacement's; it is applied to str at the end. */
        if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
            int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_BROKEN ||
                (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
                cr = ENC_CODERANGE_UNKNOWN;
            else
                cr = cr2;
        }
        plen = end0 - beg0;     /* byte length of the matched text */
        rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
        len = RSTRING_LEN(str);
        if (rlen > plen) {
            /* replacement is longer than the match: grow the buffer first */
            RESIZE_CAPA(str, len + rlen - plen);
        }
        /* re-read the pointer after the possible resize */
        p = RSTRING_PTR(str);
        if (rlen != plen) {
            /* shift the text after the match to its new position */
            memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
        }
        memcpy(p + beg0, rp, rlen);
        len += rlen - plen;
        STR_SET_LEN(str, len);
        RSTRING_PTR(str)[len] = '\0';
        ENC_CODERANGE_SET(str, cr);
        if (tainted) OBJ_TAINT(str);
        if (untrusted) OBJ_UNTRUST(str);

        return str;
    }
    /* no match: nothing was substituted */
    return Qnil;
}


/*
 *  call-seq:
 *     str.sub(pattern, replacement)         -> new_str
 *     str.sub(pattern, hash)                -> new_str
 *     str.sub(pattern) {|match| block }     -> new_str
 *
 *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
 *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
 *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
 *  regular expression metacharacters it contains will be interpreted
 *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
 *  instead of a digit.
 *
 *  If <i>replacement</i> is a <code>String</code> it will be substituted for
 *  the matched text. It may contain back-references to the pattern's capture
 *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
 *  <code>\\\k<n></code>, where <i>n</i> is a group name.
If it is a 03697 * double-quoted string, both back-references must be preceded by an 03698 * additional backslash. However, within <i>replacement</i> the special match 03699 * variables, such as <code>$&</code>, will not refer to the current match. 03700 * 03701 * If the second argument is a <code>Hash</code>, and the matched text is one 03702 * of its keys, the corresponding value is the replacement string. 03703 * 03704 * In the block form, the current match string is passed in as a parameter, 03705 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>, 03706 * <code>$&</code>, and <code>$'</code> will be set appropriately. The value 03707 * returned by the block will be substituted for the match on each call. 03708 * 03709 * The result inherits any tainting in the original string or any supplied 03710 * replacement string. 03711 * 03712 * "hello".sub(/[aeiou]/, '*') #=> "h*llo" 03713 * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo" 03714 * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello" 03715 * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo" 03716 * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV) 03717 * #=> "Is /bin/bash your preferred shell?" 
*/

static VALUE
rb_str_sub(int argc, VALUE *argv, VALUE str)
{
    /* Work on a copy; the result of sub! is ignored, so the copy is
     * returned even when nothing was substituted. */
    str = rb_str_dup(str);
    rb_str_sub_bang(argc, argv, str);
    return str;
}

/*
 * Shared implementation of String#gsub and String#gsub!.  The result is
 * assembled in a fresh buffer (dest); with +bang+ non-zero that buffer
 * replaces the contents of +str+, otherwise it is returned as a new
 * string of the receiver's class.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val, repl, match, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, n;                /* n counts iterations; it is not read here */
    long beg0, end0;
    long offset, blen, slen, len, last;
    int iter = 0;               /* non-zero when the block form is used */
    char *sp, *cp;
    int tainted = 0;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        iter = 1;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
        break;
      default:
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
    }

    pat = get_pat(argv[0], 1);
    beg = rb_reg_search(pat, str, 0, 0);
    if (beg < 0) {
        if (bang) return Qnil;  /* no match, no substitution */
        return rb_str_dup(str);
    }

    offset = 0;
    n = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen are the pointer and length handed to str_mod_check() after
     * user code has run; cp tracks the start of the not-yet-copied text. */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        n++;
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        beg0 = BEG(0);
        end0 = END(0);
        if (iter || !NIL_P(hash)) {
            if (iter) {
                val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
                val = rb_obj_as_string(val);
            }
            str_mod_check(str, sp, slen);
            if (val == dest) {  /* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else {
            /* plain replacement string: expand back-references */
            val = rb_reg_regsub(repl, str, regs, pat);
        }

        if (OBJ_TAINTED(val)) tainted = 1;

        len = beg - offset;     /* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* last keeps the offset this search started from; it is used for
         * the final rb_reg_search() after the loop */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_reg_search(pat, str, offset, 0);
    } while (beg >= 0);
    /* copy whatever follows the final match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* Search once more from the start offset of the last successful search.
     * NOTE(review): presumably this restores the match data of the last
     * successful match after the failing search that ended the loop --
     * confirm against rb_reg_search(). */
    rb_reg_search(pat, str, last, 0);
    if (bang) {
        rb_str_shared_replace(str, dest);
    }
    else {
        RBASIC(dest)->klass = rb_obj_class(str);
        OBJ_INFECT(dest, str);
        str = dest;
    }

    if (tainted) OBJ_TAINT(str);
    return str;
}


/*
 *  call-seq:
 *     str.gsub!(pattern, replacement)        -> str or nil
 *     str.gsub!(pattern) {|match| block }    -> str or nil
 *     str.gsub!(pattern)                     -> an_enumerator
 *
 *  Performs the substitutions of <code>String#gsub</code> in place, returning
 *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
 *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
 */

static VALUE
rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
{
    str_modify_keep_cr(str);
    return str_gsub(argc, argv, str, 1);
}


/*
 *  call-seq:
 *     str.gsub(pattern, replacement)       -> new_str
 *     str.gsub(pattern, hash)              -> new_str
 *     str.gsub(pattern) {|match| block }   -> new_str
 *     str.gsub(pattern)                    -> enumerator
 *
 *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
 *  <i>pattern</i> substituted for the second argument.
The <i>pattern</i> is 03870 * typically a <code>Regexp</code>; if given as a <code>String</code>, any 03871 * regular expression metacharacters it contains will be interpreted 03872 * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd', 03873 * instead of a digit. 03874 * 03875 * If <i>replacement</i> is a <code>String</code> it will be substituted for 03876 * the matched text. It may contain back-references to the pattern's capture 03877 * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or 03878 * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a 03879 * double-quoted string, both back-references must be preceded by an 03880 * additional backslash. However, within <i>replacement</i> the special match 03881 * variables, such as <code>$&</code>, will not refer to the current match. 03882 * 03883 * If the second argument is a <code>Hash</code>, and the matched text is one 03884 * of its keys, the corresponding value is the replacement string. 03885 * 03886 * In the block form, the current match string is passed in as a parameter, 03887 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>, 03888 * <code>$&</code>, and <code>$'</code> will be set appropriately. The value 03889 * returned by the block will be substituted for the match on each call. 03890 * 03891 * The result inherits any tainting in the original string or any supplied 03892 * replacement string. 03893 * 03894 * When neither a block nor a second argument is supplied, an 03895 * <code>Enumerator</code> is returned. 
03896 * 03897 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*" 03898 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>" 03899 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 " 03900 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}" 03901 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*" 03902 */ 03903 03904 static VALUE 03905 rb_str_gsub(int argc, VALUE *argv, VALUE str) 03906 { 03907 return str_gsub(argc, argv, str, 0); 03908 } 03909 03910 03911 /* 03912 * call-seq: 03913 * str.replace(other_str) -> str 03914 * 03915 * Replaces the contents and taintedness of <i>str</i> with the corresponding 03916 * values in <i>other_str</i>. 03917 * 03918 * s = "hello" #=> "hello" 03919 * s.replace "world" #=> "world" 03920 */ 03921 03922 VALUE 03923 rb_str_replace(VALUE str, VALUE str2) 03924 { 03925 str_modifiable(str); 03926 if (str == str2) return str; 03927 03928 StringValue(str2); 03929 str_discard(str); 03930 return str_replace(str, str2); 03931 } 03932 03933 /* 03934 * call-seq: 03935 * string.clear -> string 03936 * 03937 * Makes string empty. 03938 * 03939 * a = "abcde" 03940 * a.clear #=> "" 03941 */ 03942 03943 static VALUE 03944 rb_str_clear(VALUE str) 03945 { 03946 str_discard(str); 03947 STR_SET_EMBED(str); 03948 STR_SET_EMBED_LEN(str, 0); 03949 RSTRING_PTR(str)[0] = 0; 03950 if (rb_enc_asciicompat(STR_ENC_GET(str))) 03951 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 03952 else 03953 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 03954 return str; 03955 } 03956 03957 /* 03958 * call-seq: 03959 * string.chr -> string 03960 * 03961 * Returns a one-character string at the beginning of the string. 03962 * 03963 * a = "abcde" 03964 * a.chr #=> "a" 03965 */ 03966 03967 static VALUE 03968 rb_str_chr(VALUE str) 03969 { 03970 return rb_str_substr(str, 0, 1); 03971 } 03972 03973 /* 03974 * call-seq: 03975 * str.getbyte(index) -> 0 .. 255 03976 * 03977 * returns the <i>index</i>th byte as an integer. 
03978 */ 03979 static VALUE 03980 rb_str_getbyte(VALUE str, VALUE index) 03981 { 03982 long pos = NUM2LONG(index); 03983 03984 if (pos < 0) 03985 pos += RSTRING_LEN(str); 03986 if (pos < 0 || RSTRING_LEN(str) <= pos) 03987 return Qnil; 03988 03989 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]); 03990 } 03991 03992 /* 03993 * call-seq: 03994 * str.setbyte(index, int) -> int 03995 * 03996 * modifies the <i>index</i>th byte as <i>int</i>. 03997 */ 03998 static VALUE 03999 rb_str_setbyte(VALUE str, VALUE index, VALUE value) 04000 { 04001 long pos = NUM2LONG(index); 04002 int byte = NUM2INT(value); 04003 04004 rb_str_modify(str); 04005 04006 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos) 04007 rb_raise(rb_eIndexError, "index %ld out of string", pos); 04008 if (pos < 0) 04009 pos += RSTRING_LEN(str); 04010 04011 RSTRING_PTR(str)[pos] = byte; 04012 04013 return value; 04014 } 04015 04016 static VALUE 04017 str_byte_substr(VALUE str, long beg, long len) 04018 { 04019 char *p, *s = RSTRING_PTR(str); 04020 long n = RSTRING_LEN(str); 04021 VALUE str2; 04022 04023 if (beg > n || len < 0) return Qnil; 04024 if (beg < 0) { 04025 beg += n; 04026 if (beg < 0) return Qnil; 04027 } 04028 if (beg + len > n) 04029 len = n - beg; 04030 if (len <= 0) { 04031 len = 0; 04032 p = 0; 04033 } 04034 else 04035 p = s + beg; 04036 04037 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) { 04038 str2 = rb_str_new4(str); 04039 str2 = str_new3(rb_obj_class(str2), str2); 04040 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 04041 RSTRING(str2)->as.heap.len = len; 04042 } 04043 else { 04044 str2 = rb_str_new5(str, p, len); 04045 } 04046 04047 str_enc_copy(str2, str); 04048 04049 if (RSTRING_LEN(str2) == 0) { 04050 if (!rb_enc_asciicompat(STR_ENC_GET(str))) 04051 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); 04052 else 04053 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 04054 } 04055 else { 04056 switch (ENC_CODERANGE(str)) { 04057 case ENC_CODERANGE_7BIT: 04058 
ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 04059 break; 04060 default: 04061 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN); 04062 break; 04063 } 04064 } 04065 04066 OBJ_INFECT(str2, str); 04067 04068 return str2; 04069 } 04070 04071 static VALUE 04072 str_byte_aref(VALUE str, VALUE indx) 04073 { 04074 long idx; 04075 switch (TYPE(indx)) { 04076 case T_FIXNUM: 04077 idx = FIX2LONG(indx); 04078 04079 num_index: 04080 str = str_byte_substr(str, idx, 1); 04081 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil; 04082 return str; 04083 04084 default: 04085 /* check if indx is Range */ 04086 { 04087 long beg, len = RSTRING_LEN(str); 04088 04089 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { 04090 case Qfalse: 04091 break; 04092 case Qnil: 04093 return Qnil; 04094 default: 04095 return str_byte_substr(str, beg, len); 04096 } 04097 } 04098 idx = NUM2LONG(indx); 04099 goto num_index; 04100 } 04101 return Qnil; /* not reached */ 04102 } 04103 04104 /* 04105 * call-seq: 04106 * str.byteslice(fixnum) -> new_str or nil 04107 * str.byteslice(fixnum, fixnum) -> new_str or nil 04108 * str.byteslice(range) -> new_str or nil 04109 * 04110 * Byte Reference---If passed a single <code>Fixnum</code>, returns a 04111 * substring of one byte at that position. If passed two <code>Fixnum</code> 04112 * objects, returns a substring starting at the offset given by the first, and 04113 * a length given by the second. If given a <code>Range</code>, a substring containing 04114 * bytes at offsets given by the range is returned. In all three cases, if 04115 * an offset is negative, it is counted from the end of <i>str</i>. Returns 04116 * <code>nil</code> if the initial offset falls outside the string, the length 04117 * is negative, or the beginning of the range is greater than the end. 04118 * The encoding of the resulted string keeps original encoding. 
04119 * 04120 * "hello".byteslice(1) #=> "e" 04121 * "hello".byteslice(-1) #=> "o" 04122 * "hello".byteslice(1, 2) #=> "el" 04123 * "\x80\u3042".byteslice(1, 3) #=> "\u3042" 04124 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3942" 04125 */ 04126 04127 static VALUE 04128 rb_str_byteslice(int argc, VALUE *argv, VALUE str) 04129 { 04130 if (argc == 2) { 04131 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 04132 } 04133 if (argc != 1) { 04134 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 04135 } 04136 return str_byte_aref(str, argv[0]); 04137 } 04138 04139 /* 04140 * call-seq: 04141 * str.reverse -> new_str 04142 * 04143 * Returns a new string with the characters from <i>str</i> in reverse order. 04144 * 04145 * "stressed".reverse #=> "desserts" 04146 */ 04147 04148 static VALUE 04149 rb_str_reverse(VALUE str) 04150 { 04151 rb_encoding *enc; 04152 VALUE rev; 04153 char *s, *e, *p; 04154 int single = 1; 04155 04156 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str); 04157 enc = STR_ENC_GET(str); 04158 rev = rb_str_new5(str, 0, RSTRING_LEN(str)); 04159 s = RSTRING_PTR(str); e = RSTRING_END(str); 04160 p = RSTRING_END(rev); 04161 04162 if (RSTRING_LEN(str) > 1) { 04163 if (single_byte_optimizable(str)) { 04164 while (s < e) { 04165 *--p = *s++; 04166 } 04167 } 04168 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { 04169 while (s < e) { 04170 int clen = rb_enc_fast_mbclen(s, e, enc); 04171 04172 if (clen > 1 || (*s & 0x80)) single = 0; 04173 p -= clen; 04174 memcpy(p, s, clen); 04175 s += clen; 04176 } 04177 } 04178 else { 04179 while (s < e) { 04180 int clen = rb_enc_mbclen(s, e, enc); 04181 04182 if (clen > 1 || (*s & 0x80)) single = 0; 04183 p -= clen; 04184 memcpy(p, s, clen); 04185 s += clen; 04186 } 04187 } 04188 } 04189 STR_SET_LEN(rev, RSTRING_LEN(str)); 04190 OBJ_INFECT(rev, str); 04191 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) { 04192 if (single) { 04193 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 04194 } 
04195 else { 04196 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 04197 } 04198 } 04199 rb_enc_cr_str_copy_for_substr(rev, str); 04200 04201 return rev; 04202 } 04203 04204 04205 /* 04206 * call-seq: 04207 * str.reverse! -> str 04208 * 04209 * Reverses <i>str</i> in place. 04210 */ 04211 04212 static VALUE 04213 rb_str_reverse_bang(VALUE str) 04214 { 04215 if (RSTRING_LEN(str) > 1) { 04216 if (single_byte_optimizable(str)) { 04217 char *s, *e, c; 04218 04219 str_modify_keep_cr(str); 04220 s = RSTRING_PTR(str); 04221 e = RSTRING_END(str) - 1; 04222 while (s < e) { 04223 c = *s; 04224 *s++ = *e; 04225 *e-- = c; 04226 } 04227 } 04228 else { 04229 rb_str_shared_replace(str, rb_str_reverse(str)); 04230 } 04231 } 04232 else { 04233 str_modify_keep_cr(str); 04234 } 04235 return str; 04236 } 04237 04238 04239 /* 04240 * call-seq: 04241 * str.include? other_str -> true or false 04242 * 04243 * Returns <code>true</code> if <i>str</i> contains the given string or 04244 * character. 04245 * 04246 * "hello".include? "lo" #=> true 04247 * "hello".include? "ol" #=> false 04248 * "hello".include? ?h #=> true 04249 */ 04250 04251 static VALUE 04252 rb_str_include(VALUE str, VALUE arg) 04253 { 04254 long i; 04255 04256 StringValue(arg); 04257 i = rb_str_index(str, arg, 0); 04258 04259 if (i == -1) return Qfalse; 04260 return Qtrue; 04261 } 04262 04263 04264 /* 04265 * call-seq: 04266 * str.to_i(base=10) -> integer 04267 * 04268 * Returns the result of interpreting leading characters in <i>str</i> as an 04269 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the 04270 * end of a valid number are ignored. If there is not a valid number at the 04271 * start of <i>str</i>, <code>0</code> is returned. This method never raises an 04272 * exception when <i>base</i> is valid. 
04273 * 04274 * "12345".to_i #=> 12345 04275 * "99 red balloons".to_i #=> 99 04276 * "0a".to_i #=> 0 04277 * "0a".to_i(16) #=> 10 04278 * "hello".to_i #=> 0 04279 * "1100101".to_i(2) #=> 101 04280 * "1100101".to_i(8) #=> 294977 04281 * "1100101".to_i(10) #=> 1100101 04282 * "1100101".to_i(16) #=> 17826049 04283 */ 04284 04285 static VALUE 04286 rb_str_to_i(int argc, VALUE *argv, VALUE str) 04287 { 04288 int base; 04289 04290 if (argc == 0) base = 10; 04291 else { 04292 VALUE b; 04293 04294 rb_scan_args(argc, argv, "01", &b); 04295 base = NUM2INT(b); 04296 } 04297 if (base < 0) { 04298 rb_raise(rb_eArgError, "invalid radix %d", base); 04299 } 04300 return rb_str_to_inum(str, base, FALSE); 04301 } 04302 04303 04304 /* 04305 * call-seq: 04306 * str.to_f -> float 04307 * 04308 * Returns the result of interpreting leading characters in <i>str</i> as a 04309 * floating point number. Extraneous characters past the end of a valid number 04310 * are ignored. If there is not a valid number at the start of <i>str</i>, 04311 * <code>0.0</code> is returned. This method never raises an exception. 04312 * 04313 * "123.45e1".to_f #=> 1234.5 04314 * "45.67 degrees".to_f #=> 45.67 04315 * "thx1138".to_f #=> 0.0 04316 */ 04317 04318 static VALUE 04319 rb_str_to_f(VALUE str) 04320 { 04321 return DBL2NUM(rb_str_to_dbl(str, FALSE)); 04322 } 04323 04324 04325 /* 04326 * call-seq: 04327 * str.to_s -> str 04328 * str.to_str -> str 04329 * 04330 * Returns the receiver. 
04331 */ 04332 04333 static VALUE 04334 rb_str_to_s(VALUE str) 04335 { 04336 if (rb_obj_class(str) != rb_cString) { 04337 return str_duplicate(rb_cString, str); 04338 } 04339 return str; 04340 } 04341 04342 #if 0 04343 static void 04344 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc) 04345 { 04346 char s[RUBY_MAX_CHAR_LEN]; 04347 int n = rb_enc_codelen(c, enc); 04348 04349 rb_enc_mbcput(c, s, enc); 04350 rb_enc_str_buf_cat(str, s, n, enc); 04351 } 04352 #endif 04353 04354 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ 04355 04356 int 04357 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) 04358 { 04359 char buf[CHAR_ESC_LEN + 1]; 04360 int l; 04361 04362 #if SIZEOF_INT > 4 04363 c &= 0xffffffff; 04364 #endif 04365 if (unicode_p) { 04366 if (c < 0x7F && ISPRINT(c)) { 04367 snprintf(buf, CHAR_ESC_LEN, "%c", c); 04368 } 04369 else if (c < 0x10000) { 04370 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c); 04371 } 04372 else { 04373 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c); 04374 } 04375 } 04376 else { 04377 if (c < 0x100) { 04378 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c); 04379 } 04380 else { 04381 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c); 04382 } 04383 } 04384 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */ 04385 rb_str_buf_cat(result, buf, l); 04386 return l; 04387 } 04388 04389 /* 04390 * call-seq: 04391 * str.inspect -> string 04392 * 04393 * Returns a printable version of _str_, surrounded by quote marks, 04394 * with special characters escaped. 
04395 * 04396 * str = "hello" 04397 * str[3] = "\b" 04398 * str.inspect #=> "\"hel\\bo\"" 04399 */ 04400 04401 VALUE 04402 rb_str_inspect(VALUE str) 04403 { 04404 rb_encoding *enc = STR_ENC_GET(str); 04405 const char *p, *pend, *prev; 04406 char buf[CHAR_ESC_LEN + 1]; 04407 VALUE result = rb_str_buf_new(0); 04408 rb_encoding *resenc = rb_default_internal_encoding(); 04409 int unicode_p = rb_enc_unicode_p(enc); 04410 int asciicompat = rb_enc_asciicompat(enc); 04411 static rb_encoding *utf16, *utf32; 04412 04413 if (!utf16) utf16 = rb_enc_find("UTF-16"); 04414 if (!utf32) utf32 = rb_enc_find("UTF-32"); 04415 if (resenc == NULL) resenc = rb_default_external_encoding(); 04416 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding(); 04417 rb_enc_associate(result, resenc); 04418 str_buf_cat2(result, "\""); 04419 04420 p = RSTRING_PTR(str); pend = RSTRING_END(str); 04421 prev = p; 04422 if (enc == utf16) { 04423 const unsigned char *q = (const unsigned char *)p; 04424 if (q[0] == 0xFE && q[1] == 0xFF) 04425 enc = rb_enc_find("UTF-16BE"); 04426 else if (q[0] == 0xFF && q[1] == 0xFE) 04427 enc = rb_enc_find("UTF-16LE"); 04428 else 04429 unicode_p = 0; 04430 } 04431 else if (enc == utf32) { 04432 const unsigned char *q = (const unsigned char *)p; 04433 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) 04434 enc = rb_enc_find("UTF-32BE"); 04435 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) 04436 enc = rb_enc_find("UTF-32LE"); 04437 else 04438 unicode_p = 0; 04439 } 04440 while (p < pend) { 04441 unsigned int c, cc; 04442 int n; 04443 04444 n = rb_enc_precise_mbclen(p, pend, enc); 04445 if (!MBCLEN_CHARFOUND_P(n)) { 04446 if (p > prev) str_buf_cat(result, prev, p - prev); 04447 n = rb_enc_mbminlen(enc); 04448 if (pend < p + n) 04449 n = (int)(pend - p); 04450 while (n--) { 04451 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377); 04452 str_buf_cat(result, buf, strlen(buf)); 04453 prev = ++p; 04454 } 04455 continue; 04456 } 04457 n = 
MBCLEN_CHARFOUND_LEN(n); 04458 c = rb_enc_mbc_to_codepoint(p, pend, enc); 04459 p += n; 04460 if ((asciicompat || unicode_p) && 04461 (c == '"'|| c == '\\' || 04462 (c == '#' && 04463 p < pend && 04464 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) && 04465 (cc = rb_enc_codepoint(p,pend,enc), 04466 (cc == '$' || cc == '@' || cc == '{'))))) { 04467 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04468 str_buf_cat2(result, "\\"); 04469 if (asciicompat || enc == resenc) { 04470 prev = p - n; 04471 continue; 04472 } 04473 } 04474 switch (c) { 04475 case '\n': cc = 'n'; break; 04476 case '\r': cc = 'r'; break; 04477 case '\t': cc = 't'; break; 04478 case '\f': cc = 'f'; break; 04479 case '\013': cc = 'v'; break; 04480 case '\010': cc = 'b'; break; 04481 case '\007': cc = 'a'; break; 04482 case 033: cc = 'e'; break; 04483 default: cc = 0; break; 04484 } 04485 if (cc) { 04486 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04487 buf[0] = '\\'; 04488 buf[1] = (char)cc; 04489 str_buf_cat(result, buf, 2); 04490 prev = p; 04491 continue; 04492 } 04493 if ((enc == resenc && rb_enc_isprint(c, enc)) || 04494 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) { 04495 continue; 04496 } 04497 else { 04498 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04499 rb_str_buf_cat_escaped_char(result, c, unicode_p); 04500 prev = p; 04501 continue; 04502 } 04503 } 04504 if (p > prev) str_buf_cat(result, prev, p - prev); 04505 str_buf_cat2(result, "\""); 04506 04507 OBJ_INFECT(result, str); 04508 return result; 04509 } 04510 04511 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) 04512 04513 /* 04514 * call-seq: 04515 * str.dump -> new_str 04516 * 04517 * Produces a version of <i>str</i> with all nonprinting characters replaced by 04518 * <code>\nnn</code> notation and all special characters escaped. 
04519 */ 04520 04521 VALUE 04522 rb_str_dump(VALUE str) 04523 { 04524 rb_encoding *enc = rb_enc_get(str); 04525 long len; 04526 const char *p, *pend; 04527 char *q, *qend; 04528 VALUE result; 04529 int u8 = (enc == rb_utf8_encoding()); 04530 04531 len = 2; /* "" */ 04532 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04533 while (p < pend) { 04534 unsigned char c = *p++; 04535 switch (c) { 04536 case '"': case '\\': 04537 case '\n': case '\r': 04538 case '\t': case '\f': 04539 case '\013': case '\010': case '\007': case '\033': 04540 len += 2; 04541 break; 04542 04543 case '#': 04544 len += IS_EVSTR(p, pend) ? 2 : 1; 04545 break; 04546 04547 default: 04548 if (ISPRINT(c)) { 04549 len++; 04550 } 04551 else { 04552 if (u8) { /* \u{NN} */ 04553 int n = rb_enc_precise_mbclen(p-1, pend, enc); 04554 if (MBCLEN_CHARFOUND_P(n-1)) { 04555 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04556 while (cc >>= 4) len++; 04557 len += 5; 04558 p += MBCLEN_CHARFOUND_LEN(n)-1; 04559 break; 04560 } 04561 } 04562 len += 4; /* \xNN */ 04563 } 04564 break; 04565 } 04566 } 04567 if (!rb_enc_asciicompat(enc)) { 04568 len += 19; /* ".force_encoding('')" */ 04569 len += strlen(enc->name); 04570 } 04571 04572 result = rb_str_new5(str, 0, len); 04573 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04574 q = RSTRING_PTR(result); qend = q + len + 1; 04575 04576 *q++ = '"'; 04577 while (p < pend) { 04578 unsigned char c = *p++; 04579 04580 if (c == '"' || c == '\\') { 04581 *q++ = '\\'; 04582 *q++ = c; 04583 } 04584 else if (c == '#') { 04585 if (IS_EVSTR(p, pend)) *q++ = '\\'; 04586 *q++ = '#'; 04587 } 04588 else if (c == '\n') { 04589 *q++ = '\\'; 04590 *q++ = 'n'; 04591 } 04592 else if (c == '\r') { 04593 *q++ = '\\'; 04594 *q++ = 'r'; 04595 } 04596 else if (c == '\t') { 04597 *q++ = '\\'; 04598 *q++ = 't'; 04599 } 04600 else if (c == '\f') { 04601 *q++ = '\\'; 04602 *q++ = 'f'; 04603 } 04604 else if (c == '\013') { 04605 *q++ = '\\'; 04606 *q++ = 'v'; 04607 } 04608 else if 
(c == '\010') { 04609 *q++ = '\\'; 04610 *q++ = 'b'; 04611 } 04612 else if (c == '\007') { 04613 *q++ = '\\'; 04614 *q++ = 'a'; 04615 } 04616 else if (c == '\033') { 04617 *q++ = '\\'; 04618 *q++ = 'e'; 04619 } 04620 else if (ISPRINT(c)) { 04621 *q++ = c; 04622 } 04623 else { 04624 *q++ = '\\'; 04625 if (u8) { 04626 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1; 04627 if (MBCLEN_CHARFOUND_P(n)) { 04628 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04629 p += n; 04630 snprintf(q, qend-q, "u{%x}", cc); 04631 q += strlen(q); 04632 continue; 04633 } 04634 } 04635 snprintf(q, qend-q, "x%02X", c); 04636 q += 3; 04637 } 04638 } 04639 *q++ = '"'; 04640 *q = '\0'; 04641 if (!rb_enc_asciicompat(enc)) { 04642 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name); 04643 enc = rb_ascii8bit_encoding(); 04644 } 04645 OBJ_INFECT(result, str); 04646 /* result from dump is ASCII */ 04647 rb_enc_associate(result, enc); 04648 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT); 04649 return result; 04650 } 04651 04652 04653 static void 04654 rb_str_check_dummy_enc(rb_encoding *enc) 04655 { 04656 if (rb_enc_dummy_p(enc)) { 04657 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s", 04658 rb_enc_name(enc)); 04659 } 04660 } 04661 04662 /* 04663 * call-seq: 04664 * str.upcase! -> str or nil 04665 * 04666 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes 04667 * were made. 04668 * Note: case replacement is effective only in ASCII region. 
04669 */ 04670 04671 static VALUE 04672 rb_str_upcase_bang(VALUE str) 04673 { 04674 rb_encoding *enc; 04675 char *s, *send; 04676 int modify = 0; 04677 int n; 04678 04679 str_modify_keep_cr(str); 04680 enc = STR_ENC_GET(str); 04681 rb_str_check_dummy_enc(enc); 04682 s = RSTRING_PTR(str); send = RSTRING_END(str); 04683 if (single_byte_optimizable(str)) { 04684 while (s < send) { 04685 unsigned int c = *(unsigned char*)s; 04686 04687 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04688 *s = 'A' + (c - 'a'); 04689 modify = 1; 04690 } 04691 s++; 04692 } 04693 } 04694 else { 04695 int ascompat = rb_enc_asciicompat(enc); 04696 04697 while (s < send) { 04698 unsigned int c; 04699 04700 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04701 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04702 *s = 'A' + (c - 'a'); 04703 modify = 1; 04704 } 04705 s++; 04706 } 04707 else { 04708 c = rb_enc_codepoint_len(s, send, &n, enc); 04709 if (rb_enc_islower(c, enc)) { 04710 /* assuming toupper returns codepoint with same size */ 04711 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04712 modify = 1; 04713 } 04714 s += n; 04715 } 04716 } 04717 } 04718 04719 if (modify) return str; 04720 return Qnil; 04721 } 04722 04723 04724 /* 04725 * call-seq: 04726 * str.upcase -> new_str 04727 * 04728 * Returns a copy of <i>str</i> with all lowercase letters replaced with their 04729 * uppercase counterparts. The operation is locale insensitive---only 04730 * characters ``a'' to ``z'' are affected. 04731 * Note: case replacement is effective only in ASCII region. 04732 * 04733 * "hEllO".upcase #=> "HELLO" 04734 */ 04735 04736 static VALUE 04737 rb_str_upcase(VALUE str) 04738 { 04739 str = rb_str_dup(str); 04740 rb_str_upcase_bang(str); 04741 return str; 04742 } 04743 04744 04745 /* 04746 * call-seq: 04747 * str.downcase! -> str or nil 04748 * 04749 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no 04750 * changes were made. 
04751 * Note: case replacement is effective only in ASCII region. 04752 */ 04753 04754 static VALUE 04755 rb_str_downcase_bang(VALUE str) 04756 { 04757 rb_encoding *enc; 04758 char *s, *send; 04759 int modify = 0; 04760 04761 str_modify_keep_cr(str); 04762 enc = STR_ENC_GET(str); 04763 rb_str_check_dummy_enc(enc); 04764 s = RSTRING_PTR(str); send = RSTRING_END(str); 04765 if (single_byte_optimizable(str)) { 04766 while (s < send) { 04767 unsigned int c = *(unsigned char*)s; 04768 04769 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04770 *s = 'a' + (c - 'A'); 04771 modify = 1; 04772 } 04773 s++; 04774 } 04775 } 04776 else { 04777 int ascompat = rb_enc_asciicompat(enc); 04778 04779 while (s < send) { 04780 unsigned int c; 04781 int n; 04782 04783 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04784 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04785 *s = 'a' + (c - 'A'); 04786 modify = 1; 04787 } 04788 s++; 04789 } 04790 else { 04791 c = rb_enc_codepoint_len(s, send, &n, enc); 04792 if (rb_enc_isupper(c, enc)) { 04793 /* assuming toupper returns codepoint with same size */ 04794 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04795 modify = 1; 04796 } 04797 s += n; 04798 } 04799 } 04800 } 04801 04802 if (modify) return str; 04803 return Qnil; 04804 } 04805 04806 04807 /* 04808 * call-seq: 04809 * str.downcase -> new_str 04810 * 04811 * Returns a copy of <i>str</i> with all uppercase letters replaced with their 04812 * lowercase counterparts. The operation is locale insensitive---only 04813 * characters ``A'' to ``Z'' are affected. 04814 * Note: case replacement is effective only in ASCII region. 04815 * 04816 * "hEllO".downcase #=> "hello" 04817 */ 04818 04819 static VALUE 04820 rb_str_downcase(VALUE str) 04821 { 04822 str = rb_str_dup(str); 04823 rb_str_downcase_bang(str); 04824 return str; 04825 } 04826 04827 04828 /* 04829 * call-seq: 04830 * str.capitalize! 
-> str or nil 04831 * 04832 * Modifies <i>str</i> by converting the first character to uppercase and the 04833 * remainder to lowercase. Returns <code>nil</code> if no changes are made. 04834 * Note: case conversion is effective only in ASCII region. 04835 * 04836 * a = "hello" 04837 * a.capitalize! #=> "Hello" 04838 * a #=> "Hello" 04839 * a.capitalize! #=> nil 04840 */ 04841 04842 static VALUE 04843 rb_str_capitalize_bang(VALUE str) 04844 { 04845 rb_encoding *enc; 04846 char *s, *send; 04847 int modify = 0; 04848 unsigned int c; 04849 int n; 04850 04851 str_modify_keep_cr(str); 04852 enc = STR_ENC_GET(str); 04853 rb_str_check_dummy_enc(enc); 04854 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 04855 s = RSTRING_PTR(str); send = RSTRING_END(str); 04856 04857 c = rb_enc_codepoint_len(s, send, &n, enc); 04858 if (rb_enc_islower(c, enc)) { 04859 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04860 modify = 1; 04861 } 04862 s += n; 04863 while (s < send) { 04864 c = rb_enc_codepoint_len(s, send, &n, enc); 04865 if (rb_enc_isupper(c, enc)) { 04866 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04867 modify = 1; 04868 } 04869 s += n; 04870 } 04871 04872 if (modify) return str; 04873 return Qnil; 04874 } 04875 04876 04877 /* 04878 * call-seq: 04879 * str.capitalize -> new_str 04880 * 04881 * Returns a copy of <i>str</i> with the first character converted to uppercase 04882 * and the remainder to lowercase. 04883 * Note: case conversion is effective only in ASCII region. 04884 * 04885 * "hello".capitalize #=> "Hello" 04886 * "HELLO".capitalize #=> "Hello" 04887 * "123ABC".capitalize #=> "123abc" 04888 */ 04889 04890 static VALUE 04891 rb_str_capitalize(VALUE str) 04892 { 04893 str = rb_str_dup(str); 04894 rb_str_capitalize_bang(str); 04895 return str; 04896 } 04897 04898 04899 /* 04900 * call-seq: 04901 * str.swapcase! 
-> str or nil 04902 * 04903 * Equivalent to <code>String#swapcase</code>, but modifies the receiver in 04904 * place, returning <i>str</i>, or <code>nil</code> if no changes were made. 04905 * Note: case conversion is effective only in ASCII region. 04906 */ 04907 04908 static VALUE 04909 rb_str_swapcase_bang(VALUE str) 04910 { 04911 rb_encoding *enc; 04912 char *s, *send; 04913 int modify = 0; 04914 int n; 04915 04916 str_modify_keep_cr(str); 04917 enc = STR_ENC_GET(str); 04918 rb_str_check_dummy_enc(enc); 04919 s = RSTRING_PTR(str); send = RSTRING_END(str); 04920 while (s < send) { 04921 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc); 04922 04923 if (rb_enc_isupper(c, enc)) { 04924 /* assuming toupper returns codepoint with same size */ 04925 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04926 modify = 1; 04927 } 04928 else if (rb_enc_islower(c, enc)) { 04929 /* assuming tolower returns codepoint with same size */ 04930 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04931 modify = 1; 04932 } 04933 s += n; 04934 } 04935 04936 if (modify) return str; 04937 return Qnil; 04938 } 04939 04940 04941 /* 04942 * call-seq: 04943 * str.swapcase -> new_str 04944 * 04945 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted 04946 * to lowercase and lowercase characters converted to uppercase. 04947 * Note: case conversion is effective only in ASCII region. 
04948 * 04949 * "Hello".swapcase #=> "hELLO" 04950 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11" 04951 */ 04952 04953 static VALUE 04954 rb_str_swapcase(VALUE str) 04955 { 04956 str = rb_str_dup(str); 04957 rb_str_swapcase_bang(str); 04958 return str; 04959 } 04960 04961 typedef unsigned char *USTR; 04962 04963 struct tr { 04964 int gen; 04965 unsigned int now, max; 04966 char *p, *pend; 04967 }; 04968 04969 static unsigned int 04970 trnext(struct tr *t, rb_encoding *enc) 04971 { 04972 int n; 04973 04974 for (;;) { 04975 if (!t->gen) { 04976 if (t->p == t->pend) return -1; 04977 if (t->p < t->pend - 1 && *t->p == '\\') { 04978 t->p++; 04979 } 04980 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 04981 t->p += n; 04982 if (t->p < t->pend - 1 && *t->p == '-') { 04983 t->p++; 04984 if (t->p < t->pend) { 04985 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 04986 t->p += n; 04987 if (t->now > c) { 04988 if (t->now < 0x80 && c < 0x80) { 04989 rb_raise(rb_eArgError, 04990 "invalid range \"%c-%c\" in string transliteration", 04991 t->now, c); 04992 } 04993 else { 04994 rb_raise(rb_eArgError, "invalid range in string transliteration"); 04995 } 04996 continue; /* not reached */ 04997 } 04998 t->gen = 1; 04999 t->max = c; 05000 } 05001 } 05002 return t->now; 05003 } 05004 else if (++t->now < t->max) { 05005 return t->now; 05006 } 05007 else { 05008 t->gen = 0; 05009 return t->max; 05010 } 05011 } 05012 } 05013 05014 static VALUE rb_str_delete_bang(int,VALUE*,VALUE); 05015 05016 static VALUE 05017 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) 05018 { 05019 const unsigned int errc = -1; 05020 unsigned int trans[256]; 05021 rb_encoding *enc, *e1, *e2; 05022 struct tr trsrc, trrepl; 05023 int cflag = 0; 05024 unsigned int c, c0, last = 0; 05025 int modify = 0, i, l; 05026 char *s, *send; 05027 VALUE hash = 0; 05028 int singlebyte = single_byte_optimizable(str); 05029 int cr; 05030 05031 #define CHECK_IF_ASCII(c) \ 05032 (void)((cr == 
ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \ 05033 (cr = ENC_CODERANGE_VALID) : 0) 05034 05035 StringValue(src); 05036 StringValue(repl); 05037 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05038 if (RSTRING_LEN(repl) == 0) { 05039 return rb_str_delete_bang(1, &src, str); 05040 } 05041 05042 cr = ENC_CODERANGE(str); 05043 e1 = rb_enc_check(str, src); 05044 e2 = rb_enc_check(str, repl); 05045 if (e1 == e2) { 05046 enc = e1; 05047 } 05048 else { 05049 enc = rb_enc_check(src, repl); 05050 } 05051 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src); 05052 if (RSTRING_LEN(src) > 1 && 05053 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' && 05054 trsrc.p + l < trsrc.pend) { 05055 cflag = 1; 05056 trsrc.p += l; 05057 } 05058 trrepl.p = RSTRING_PTR(repl); 05059 trrepl.pend = trrepl.p + RSTRING_LEN(repl); 05060 trsrc.gen = trrepl.gen = 0; 05061 trsrc.now = trrepl.now = 0; 05062 trsrc.max = trrepl.max = 0; 05063 05064 if (cflag) { 05065 for (i=0; i<256; i++) { 05066 trans[i] = 1; 05067 } 05068 while ((c = trnext(&trsrc, enc)) != errc) { 05069 if (c < 256) { 05070 trans[c] = errc; 05071 } 05072 else { 05073 if (!hash) hash = rb_hash_new(); 05074 rb_hash_aset(hash, UINT2NUM(c), Qtrue); 05075 } 05076 } 05077 while ((c = trnext(&trrepl, enc)) != errc) 05078 /* retrieve last replacer */; 05079 last = trrepl.now; 05080 for (i=0; i<256; i++) { 05081 if (trans[i] != errc) { 05082 trans[i] = last; 05083 } 05084 } 05085 } 05086 else { 05087 unsigned int r; 05088 05089 for (i=0; i<256; i++) { 05090 trans[i] = errc; 05091 } 05092 while ((c = trnext(&trsrc, enc)) != errc) { 05093 r = trnext(&trrepl, enc); 05094 if (r == errc) r = trrepl.now; 05095 if (c < 256) { 05096 trans[c] = r; 05097 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0; 05098 } 05099 else { 05100 if (!hash) hash = rb_hash_new(); 05101 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r)); 05102 } 05103 } 05104 } 05105 05106 if (cr == ENC_CODERANGE_VALID) 05107 cr = ENC_CODERANGE_7BIT; 05108 
str_modify_keep_cr(str); 05109 s = RSTRING_PTR(str); send = RSTRING_END(str); 05110 if (sflag) { 05111 int clen, tlen; 05112 long offset, max = RSTRING_LEN(str); 05113 unsigned int save = -1; 05114 char *buf = ALLOC_N(char, max), *t = buf; 05115 05116 while (s < send) { 05117 int may_modify = 0; 05118 05119 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05120 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05121 05122 s += clen; 05123 if (c < 256) { 05124 c = trans[c]; 05125 } 05126 else if (hash) { 05127 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05128 if (NIL_P(tmp)) { 05129 if (cflag) c = last; 05130 else c = errc; 05131 } 05132 else if (cflag) c = errc; 05133 else c = NUM2INT(tmp); 05134 } 05135 else { 05136 c = errc; 05137 } 05138 if (c != (unsigned int)-1) { 05139 if (save == c) { 05140 CHECK_IF_ASCII(c); 05141 continue; 05142 } 05143 save = c; 05144 tlen = rb_enc_codelen(c, enc); 05145 modify = 1; 05146 } 05147 else { 05148 save = -1; 05149 c = c0; 05150 if (enc != e1) may_modify = 1; 05151 } 05152 while (t - buf + tlen >= max) { 05153 offset = t - buf; 05154 max *= 2; 05155 REALLOC_N(buf, char, max); 05156 t = buf + offset; 05157 } 05158 rb_enc_mbcput(c, t, enc); 05159 if (may_modify && memcmp(s, t, tlen) != 0) { 05160 modify = 1; 05161 } 05162 CHECK_IF_ASCII(c); 05163 t += tlen; 05164 } 05165 if (!STR_EMBED_P(str)) { 05166 xfree(RSTRING(str)->as.heap.ptr); 05167 } 05168 *t = '\0'; 05169 RSTRING(str)->as.heap.ptr = buf; 05170 RSTRING(str)->as.heap.len = t - buf; 05171 STR_SET_NOEMBED(str); 05172 RSTRING(str)->as.heap.aux.capa = max; 05173 } 05174 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) { 05175 while (s < send) { 05176 c = (unsigned char)*s; 05177 if (trans[c] != errc) { 05178 if (!cflag) { 05179 c = trans[c]; 05180 *s = c; 05181 modify = 1; 05182 } 05183 else { 05184 *s = last; 05185 modify = 1; 05186 } 05187 } 05188 CHECK_IF_ASCII(c); 05189 s++; 05190 } 05191 } 05192 else { 05193 int clen, tlen, max = (int)(RSTRING_LEN(str) * 
1.2); 05194 long offset; 05195 char *buf = ALLOC_N(char, max), *t = buf; 05196 05197 while (s < send) { 05198 int may_modify = 0; 05199 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05200 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05201 05202 if (c < 256) { 05203 c = trans[c]; 05204 } 05205 else if (hash) { 05206 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05207 if (NIL_P(tmp)) { 05208 if (cflag) c = last; 05209 else c = errc; 05210 } 05211 else if (cflag) c = errc; 05212 else c = NUM2INT(tmp); 05213 } 05214 else { 05215 c = cflag ? last : errc; 05216 } 05217 if (c != errc) { 05218 tlen = rb_enc_codelen(c, enc); 05219 modify = 1; 05220 } 05221 else { 05222 c = c0; 05223 if (enc != e1) may_modify = 1; 05224 } 05225 while (t - buf + tlen >= max) { 05226 offset = t - buf; 05227 max *= 2; 05228 REALLOC_N(buf, char, max); 05229 t = buf + offset; 05230 } 05231 if (s != t) { 05232 rb_enc_mbcput(c, t, enc); 05233 if (may_modify && memcmp(s, t, tlen) != 0) { 05234 modify = 1; 05235 } 05236 } 05237 CHECK_IF_ASCII(c); 05238 s += clen; 05239 t += tlen; 05240 } 05241 if (!STR_EMBED_P(str)) { 05242 xfree(RSTRING(str)->as.heap.ptr); 05243 } 05244 *t = '\0'; 05245 RSTRING(str)->as.heap.ptr = buf; 05246 RSTRING(str)->as.heap.len = t - buf; 05247 STR_SET_NOEMBED(str); 05248 RSTRING(str)->as.heap.aux.capa = max; 05249 } 05250 05251 if (modify) { 05252 if (cr != ENC_CODERANGE_BROKEN) 05253 ENC_CODERANGE_SET(str, cr); 05254 rb_enc_associate(str, enc); 05255 return str; 05256 } 05257 return Qnil; 05258 } 05259 05260 05261 /* 05262 * call-seq: 05263 * str.tr!(from_str, to_str) -> str or nil 05264 * 05265 * Translates <i>str</i> in place, using the same rules as 05266 * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no 05267 * changes were made. 
05268 */ 05269 05270 static VALUE 05271 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl) 05272 { 05273 return tr_trans(str, src, repl, 0); 05274 } 05275 05276 05277 /* 05278 * call-seq: 05279 * str.tr(from_str, to_str) => new_str 05280 * 05281 * Returns a copy of <i>str</i> with the characters in <i>from_str</i> 05282 * replaced by the corresponding characters in <i>to_str</i>. If 05283 * <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last 05284 * character in order to maintain the correspondence. 05285 * 05286 * "hello".tr('el', 'ip') #=> "hippo" 05287 * "hello".tr('aeiou', '*') #=> "h*ll*" 05288 * 05289 * Both strings may use the c1-c2 notation to denote ranges of characters, 05290 * and <i>from_str</i> may start with a <code>^</code>, which denotes all 05291 * characters except those listed. 05292 * 05293 * "hello".tr('a-y', 'b-z') #=> "ifmmp" 05294 * "hello".tr('^aeiou', '*') #=> "*e**o" 05295 */ 05296 05297 static VALUE 05298 rb_str_tr(VALUE str, VALUE src, VALUE repl) 05299 { 05300 str = rb_str_dup(str); 05301 tr_trans(str, src, repl, 0); 05302 return str; 05303 } 05304 05305 #define TR_TABLE_SIZE 257 05306 static void 05307 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first, 05308 VALUE *tablep, VALUE *ctablep, rb_encoding *enc) 05309 { 05310 const unsigned int errc = -1; 05311 char buf[256]; 05312 struct tr tr; 05313 unsigned int c; 05314 VALUE table = 0, ptable = 0; 05315 int i, l, cflag = 0; 05316 05317 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str); 05318 tr.gen = tr.now = tr.max = 0; 05319 05320 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') { 05321 cflag = 1; 05322 tr.p += l; 05323 } 05324 if (first) { 05325 for (i=0; i<256; i++) { 05326 stable[i] = 1; 05327 } 05328 stable[256] = cflag; 05329 } 05330 else if (stable[256] && !cflag) { 05331 stable[256] = 0; 05332 } 05333 for (i=0; i<256; i++) { 05334 buf[i] = cflag; 05335 } 05336 05337 while ((c = trnext(&tr, enc)) != errc) { 
05338 if (c < 256) { 05339 buf[c & 0xff] = !cflag; 05340 } 05341 else { 05342 VALUE key = UINT2NUM(c); 05343 05344 if (!table) { 05345 table = rb_hash_new(); 05346 if (cflag) { 05347 ptable = *ctablep; 05348 *ctablep = table; 05349 } 05350 else { 05351 ptable = *tablep; 05352 *tablep = table; 05353 } 05354 } 05355 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) { 05356 rb_hash_aset(table, key, Qtrue); 05357 } 05358 } 05359 } 05360 for (i=0; i<256; i++) { 05361 stable[i] = stable[i] && buf[i]; 05362 } 05363 } 05364 05365 05366 static int 05367 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel) 05368 { 05369 if (c < 256) { 05370 return table[c] != 0; 05371 } 05372 else { 05373 VALUE v = UINT2NUM(c); 05374 05375 if (del) { 05376 if (!NIL_P(rb_hash_lookup(del, v)) && 05377 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) { 05378 return TRUE; 05379 } 05380 } 05381 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) { 05382 return FALSE; 05383 } 05384 return table[256] ? TRUE : FALSE; 05385 } 05386 } 05387 05388 /* 05389 * call-seq: 05390 * str.delete!([other_str]+) -> str or nil 05391 * 05392 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or 05393 * <code>nil</code> if <i>str</i> was not modified. 
05394 */ 05395 05396 static VALUE 05397 rb_str_delete_bang(int argc, VALUE *argv, VALUE str) 05398 { 05399 char squeez[TR_TABLE_SIZE]; 05400 rb_encoding *enc = 0; 05401 char *s, *send, *t; 05402 VALUE del = 0, nodel = 0; 05403 int modify = 0; 05404 int i, ascompat, cr; 05405 05406 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05407 if (argc < 1) { 05408 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)"); 05409 } 05410 for (i=0; i<argc; i++) { 05411 VALUE s = argv[i]; 05412 05413 StringValue(s); 05414 enc = rb_enc_check(str, s); 05415 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05416 } 05417 05418 str_modify_keep_cr(str); 05419 ascompat = rb_enc_asciicompat(enc); 05420 s = t = RSTRING_PTR(str); 05421 send = RSTRING_END(str); 05422 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 05423 while (s < send) { 05424 unsigned int c; 05425 int clen; 05426 05427 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05428 if (squeez[c]) { 05429 modify = 1; 05430 } 05431 else { 05432 if (t != s) *t = c; 05433 t++; 05434 } 05435 s++; 05436 } 05437 else { 05438 c = rb_enc_codepoint_len(s, send, &clen, enc); 05439 05440 if (tr_find(c, squeez, del, nodel)) { 05441 modify = 1; 05442 } 05443 else { 05444 if (t != s) rb_enc_mbcput(c, t, enc); 05445 t += clen; 05446 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; 05447 } 05448 s += clen; 05449 } 05450 } 05451 *t = '\0'; 05452 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05453 ENC_CODERANGE_SET(str, cr); 05454 05455 if (modify) return str; 05456 return Qnil; 05457 } 05458 05459 05460 /* 05461 * call-seq: 05462 * str.delete([other_str]+) -> new_str 05463 * 05464 * Returns a copy of <i>str</i> with all characters in the intersection of its 05465 * arguments deleted. Uses the same rules for building the set of characters as 05466 * <code>String#count</code>. 
05467 * 05468 * "hello".delete "l","lo" #=> "heo" 05469 * "hello".delete "lo" #=> "he" 05470 * "hello".delete "aeiou", "^e" #=> "hell" 05471 * "hello".delete "ej-m" #=> "ho" 05472 */ 05473 05474 static VALUE 05475 rb_str_delete(int argc, VALUE *argv, VALUE str) 05476 { 05477 str = rb_str_dup(str); 05478 rb_str_delete_bang(argc, argv, str); 05479 return str; 05480 } 05481 05482 05483 /* 05484 * call-seq: 05485 * str.squeeze!([other_str]*) -> str or nil 05486 * 05487 * Squeezes <i>str</i> in place, returning either <i>str</i>, or 05488 * <code>nil</code> if no changes were made. 05489 */ 05490 05491 static VALUE 05492 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) 05493 { 05494 char squeez[TR_TABLE_SIZE]; 05495 rb_encoding *enc = 0; 05496 VALUE del = 0, nodel = 0; 05497 char *s, *send, *t; 05498 int i, modify = 0; 05499 int ascompat, singlebyte = single_byte_optimizable(str); 05500 unsigned int save; 05501 05502 if (argc == 0) { 05503 enc = STR_ENC_GET(str); 05504 } 05505 else { 05506 for (i=0; i<argc; i++) { 05507 VALUE s = argv[i]; 05508 05509 StringValue(s); 05510 enc = rb_enc_check(str, s); 05511 if (singlebyte && !single_byte_optimizable(s)) 05512 singlebyte = 0; 05513 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05514 } 05515 } 05516 05517 str_modify_keep_cr(str); 05518 s = t = RSTRING_PTR(str); 05519 if (!s || RSTRING_LEN(str) == 0) return Qnil; 05520 send = RSTRING_END(str); 05521 save = -1; 05522 ascompat = rb_enc_asciicompat(enc); 05523 05524 if (singlebyte) { 05525 while (s < send) { 05526 unsigned int c = *(unsigned char*)s++; 05527 if (c != save || (argc > 0 && !squeez[c])) { 05528 *t++ = save = c; 05529 } 05530 } 05531 } else { 05532 while (s < send) { 05533 unsigned int c; 05534 int clen; 05535 05536 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05537 if (c != save || (argc > 0 && !squeez[c])) { 05538 *t++ = save = c; 05539 } 05540 s++; 05541 } 05542 else { 05543 c = rb_enc_codepoint_len(s, send, &clen, enc); 05544 05545 if (c != 
save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { 05546 if (t != s) rb_enc_mbcput(c, t, enc); 05547 save = c; 05548 t += clen; 05549 } 05550 s += clen; 05551 } 05552 } 05553 } 05554 05555 *t = '\0'; 05556 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { 05557 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05558 modify = 1; 05559 } 05560 05561 if (modify) return str; 05562 return Qnil; 05563 } 05564 05565 05566 /* 05567 * call-seq: 05568 * str.squeeze([other_str]*) -> new_str 05569 * 05570 * Builds a set of characters from the <i>other_str</i> parameter(s) using the 05571 * procedure described for <code>String#count</code>. Returns a new string 05572 * where runs of the same character that occur in this set are replaced by a 05573 * single character. If no arguments are given, all runs of identical 05574 * characters are replaced by a single character. 05575 * 05576 * "yellow moon".squeeze #=> "yelow mon" 05577 * " now is the".squeeze(" ") #=> " now is the" 05578 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls" 05579 */ 05580 05581 static VALUE 05582 rb_str_squeeze(int argc, VALUE *argv, VALUE str) 05583 { 05584 str = rb_str_dup(str); 05585 rb_str_squeeze_bang(argc, argv, str); 05586 return str; 05587 } 05588 05589 05590 /* 05591 * call-seq: 05592 * str.tr_s!(from_str, to_str) -> str or nil 05593 * 05594 * Performs <code>String#tr_s</code> processing on <i>str</i> in place, 05595 * returning <i>str</i>, or <code>nil</code> if no changes were made. 05596 */ 05597 05598 static VALUE 05599 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl) 05600 { 05601 return tr_trans(str, src, repl, 1); 05602 } 05603 05604 05605 /* 05606 * call-seq: 05607 * str.tr_s(from_str, to_str) -> new_str 05608 * 05609 * Processes a copy of <i>str</i> as described under <code>String#tr</code>, 05610 * then removes duplicate characters in regions that were affected by the 05611 * translation. 
05612 * 05613 * "hello".tr_s('l', 'r') #=> "hero" 05614 * "hello".tr_s('el', '*') #=> "h*o" 05615 * "hello".tr_s('el', 'hx') #=> "hhxo" 05616 */ 05617 05618 static VALUE 05619 rb_str_tr_s(VALUE str, VALUE src, VALUE repl) 05620 { 05621 str = rb_str_dup(str); 05622 tr_trans(str, src, repl, 1); 05623 return str; 05624 } 05625 05626 05627 /* 05628 * call-seq: 05629 * str.count([other_str]+) -> fixnum 05630 * 05631 * Each <i>other_str</i> parameter defines a set of characters to count. The 05632 * intersection of these sets defines the characters to count in 05633 * <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is 05634 * negated. The sequence c1--c2 means all characters between c1 and c2. 05635 * 05636 * a = "hello world" 05637 * a.count "lo" #=> 5 05638 * a.count "lo", "o" #=> 2 05639 * a.count "hello", "^l" #=> 4 05640 * a.count "ej-m" #=> 4 05641 */ 05642 05643 static VALUE 05644 rb_str_count(int argc, VALUE *argv, VALUE str) 05645 { 05646 char table[TR_TABLE_SIZE]; 05647 rb_encoding *enc = 0; 05648 VALUE del = 0, nodel = 0; 05649 char *s, *send; 05650 int i; 05651 int ascompat; 05652 05653 if (argc < 1) { 05654 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)"); 05655 } 05656 for (i=0; i<argc; i++) { 05657 VALUE tstr = argv[i]; 05658 unsigned char c; 05659 05660 StringValue(tstr); 05661 enc = rb_enc_check(str, tstr); 05662 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) && 05663 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) { 05664 int n = 0; 05665 05666 s = RSTRING_PTR(str); 05667 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 05668 send = RSTRING_END(str); 05669 while (s < send) { 05670 if (*(unsigned char*)s++ == c) n++; 05671 } 05672 return INT2NUM(n); 05673 } 05674 tr_setup_table(tstr, table, i==0, &del, &nodel, enc); 05675 } 05676 05677 s = RSTRING_PTR(str); 05678 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 05679 send = RSTRING_END(str); 05680 ascompat = rb_enc_asciicompat(enc); 
05681 i = 0; 05682 while (s < send) { 05683 unsigned int c; 05684 05685 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05686 if (table[c]) { 05687 i++; 05688 } 05689 s++; 05690 } 05691 else { 05692 int clen; 05693 c = rb_enc_codepoint_len(s, send, &clen, enc); 05694 if (tr_find(c, table, del, nodel)) { 05695 i++; 05696 } 05697 s += clen; 05698 } 05699 } 05700 05701 return INT2NUM(i); 05702 } 05703 05704 static const char isspacetable[256] = { 05705 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 05706 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05707 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05708 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05709 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05710 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05711 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05712 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05713 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05714 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05715 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05716 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05717 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05718 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05719 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05720 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 05721 }; 05722 05723 #define ascii_isspace(c) isspacetable[(unsigned char)(c)] 05724 05725 /* 05726 * call-seq: 05727 * str.split(pattern=$;, [limit]) -> anArray 05728 * 05729 * Divides <i>str</i> into substrings based on a delimiter, returning an array 05730 * of these substrings. 05731 * 05732 * If <i>pattern</i> is a <code>String</code>, then its contents are used as 05733 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single 05734 * space, <i>str</i> is split on whitespace, with leading whitespace and runs 05735 * of contiguous whitespace characters ignored. 
05736 * 05737 * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the 05738 * pattern matches. Whenever the pattern matches a zero-length string, 05739 * <i>str</i> is split into individual characters. If <i>pattern</i> contains 05740 * groups, the respective matches will be returned in the array as well. 05741 * 05742 * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If 05743 * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is 05744 * split on whitespace as if ` ' were specified. 05745 * 05746 * If the <i>limit</i> parameter is omitted, trailing null fields are 05747 * suppressed. If <i>limit</i> is a positive number, at most that number of 05748 * fields will be returned (if <i>limit</i> is <code>1</code>, the entire 05749 * string is returned as the only entry in an array). If negative, there is no 05750 * limit to the number of fields returned, and trailing null fields are not 05751 * suppressed. 05752 * 05753 * " now's the time".split #=> ["now's", "the", "time"] 05754 * " now's the time".split(' ') #=> ["now's", "the", "time"] 05755 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"] 05756 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"] 05757 * "hello".split(//) #=> ["h", "e", "l", "l", "o"] 05758 * "hello".split(//, 3) #=> ["h", "e", "llo"] 05759 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"] 05760 * 05761 * "mellow yellow".split("ello") #=> ["m", "w y", "w"] 05762 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"] 05763 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"] 05764 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""] 05765 */ 05766 05767 static VALUE 05768 rb_str_split_m(int argc, VALUE *argv, VALUE str) 05769 { 05770 rb_encoding *enc; 05771 VALUE spat; 05772 VALUE limit; 05773 enum {awk, string, regexp} split_type; 05774 long beg, end, i = 0; 05775 int lim = 0; 05776 VALUE result, tmp; 05777 05778 if 
(rb_scan_args(argc, argv, "02", &spat, &limit) == 2) { 05779 lim = NUM2INT(limit); 05780 if (lim <= 0) limit = Qnil; 05781 else if (lim == 1) { 05782 if (RSTRING_LEN(str) == 0) 05783 return rb_ary_new2(0); 05784 return rb_ary_new3(1, str); 05785 } 05786 i = 1; 05787 } 05788 05789 enc = STR_ENC_GET(str); 05790 if (NIL_P(spat)) { 05791 if (!NIL_P(rb_fs)) { 05792 spat = rb_fs; 05793 goto fs_set; 05794 } 05795 split_type = awk; 05796 } 05797 else { 05798 fs_set: 05799 if (TYPE(spat) == T_STRING) { 05800 rb_encoding *enc2 = STR_ENC_GET(spat); 05801 05802 split_type = string; 05803 if (RSTRING_LEN(spat) == 0) { 05804 /* Special case - split into chars */ 05805 spat = rb_reg_regcomp(spat); 05806 split_type = regexp; 05807 } 05808 else if (rb_enc_asciicompat(enc2) == 1) { 05809 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){ 05810 split_type = awk; 05811 } 05812 } 05813 else { 05814 int l; 05815 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' && 05816 RSTRING_LEN(spat) == l) { 05817 split_type = awk; 05818 } 05819 } 05820 } 05821 else { 05822 spat = get_pat(spat, 1); 05823 split_type = regexp; 05824 } 05825 } 05826 05827 result = rb_ary_new(); 05828 beg = 0; 05829 if (split_type == awk) { 05830 char *ptr = RSTRING_PTR(str); 05831 char *eptr = RSTRING_END(str); 05832 char *bptr = ptr; 05833 int skip = 1; 05834 unsigned int c; 05835 05836 end = beg; 05837 if (is_ascii_string(str)) { 05838 while (ptr < eptr) { 05839 c = (unsigned char)*ptr++; 05840 if (skip) { 05841 if (ascii_isspace(c)) { 05842 beg = ptr - bptr; 05843 } 05844 else { 05845 end = ptr - bptr; 05846 skip = 0; 05847 if (!NIL_P(limit) && lim <= i) break; 05848 } 05849 } 05850 else if (ascii_isspace(c)) { 05851 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 05852 skip = 1; 05853 beg = ptr - bptr; 05854 if (!NIL_P(limit)) ++i; 05855 } 05856 else { 05857 end = ptr - bptr; 05858 } 05859 } 05860 } 05861 else { 05862 while (ptr < eptr) { 05863 int n; 05864 05865 c = 
rb_enc_codepoint_len(ptr, eptr, &n, enc); 05866 ptr += n; 05867 if (skip) { 05868 if (rb_isspace(c)) { 05869 beg = ptr - bptr; 05870 } 05871 else { 05872 end = ptr - bptr; 05873 skip = 0; 05874 if (!NIL_P(limit) && lim <= i) break; 05875 } 05876 } 05877 else if (rb_isspace(c)) { 05878 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 05879 skip = 1; 05880 beg = ptr - bptr; 05881 if (!NIL_P(limit)) ++i; 05882 } 05883 else { 05884 end = ptr - bptr; 05885 } 05886 } 05887 } 05888 } 05889 else if (split_type == string) { 05890 char *ptr = RSTRING_PTR(str); 05891 char *temp = ptr; 05892 char *eptr = RSTRING_END(str); 05893 char *sptr = RSTRING_PTR(spat); 05894 long slen = RSTRING_LEN(spat); 05895 05896 if (is_broken_string(str)) { 05897 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str))); 05898 } 05899 if (is_broken_string(spat)) { 05900 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat))); 05901 } 05902 enc = rb_enc_check(str, spat); 05903 while (ptr < eptr && 05904 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) { 05905 /* Check we are at the start of a char */ 05906 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc); 05907 if (t != ptr + end) { 05908 ptr = t; 05909 continue; 05910 } 05911 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end)); 05912 ptr += end + slen; 05913 if (!NIL_P(limit) && lim <= ++i) break; 05914 } 05915 beg = ptr - temp; 05916 } 05917 else { 05918 char *ptr = RSTRING_PTR(str); 05919 long len = RSTRING_LEN(str); 05920 long start = beg; 05921 long idx; 05922 int last_null = 0; 05923 struct re_registers *regs; 05924 05925 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) { 05926 regs = RMATCH_REGS(rb_backref_get()); 05927 if (start == end && BEG(0) == END(0)) { 05928 if (!ptr) { 05929 rb_ary_push(result, str_new_empty(str)); 05930 break; 05931 } 05932 else if (last_null == 1) { 05933 rb_ary_push(result, rb_str_subseq(str, beg, 05934 
rb_enc_fast_mbclen(ptr+beg, 05935 ptr+len, 05936 enc))); 05937 beg = start; 05938 } 05939 else { 05940 if (ptr+start == ptr+len) 05941 start++; 05942 else 05943 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc); 05944 last_null = 1; 05945 continue; 05946 } 05947 } 05948 else { 05949 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 05950 beg = start = END(0); 05951 } 05952 last_null = 0; 05953 05954 for (idx=1; idx < regs->num_regs; idx++) { 05955 if (BEG(idx) == -1) continue; 05956 if (BEG(idx) == END(idx)) 05957 tmp = str_new_empty(str); 05958 else 05959 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx)); 05960 rb_ary_push(result, tmp); 05961 } 05962 if (!NIL_P(limit) && lim <= ++i) break; 05963 } 05964 } 05965 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { 05966 if (RSTRING_LEN(str) == beg) 05967 tmp = str_new_empty(str); 05968 else 05969 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg); 05970 rb_ary_push(result, tmp); 05971 } 05972 if (NIL_P(limit) && lim == 0) { 05973 long len; 05974 while ((len = RARRAY_LEN(result)) > 0 && 05975 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0)) 05976 rb_ary_pop(result); 05977 } 05978 05979 return result; 05980 } 05981 05982 VALUE 05983 rb_str_split(VALUE str, const char *sep0) 05984 { 05985 VALUE sep; 05986 05987 StringValue(str); 05988 sep = rb_str_new2(sep0); 05989 return rb_str_split_m(1, &sep, str); 05990 } 05991 05992 05993 /* 05994 * call-seq: 05995 * str.each_line(separator=$/) {|substr| block } -> str 05996 * str.each_line(separator=$/) -> an_enumerator 05997 * 05998 * str.lines(separator=$/) {|substr| block } -> str 05999 * str.lines(separator=$/) -> an_enumerator 06000 * 06001 * Splits <i>str</i> using the supplied parameter as the record separator 06002 * (<code>$/</code> by default), passing each substring in turn to the supplied 06003 * block. 
If a zero-length record separator is supplied, the string is split 06004 * into paragraphs delimited by multiple successive newlines. 06005 * 06006 * If no block is given, an enumerator is returned instead. 06007 * 06008 * print "Example one\n" 06009 * "hello\nworld".each_line {|s| p s} 06010 * print "Example two\n" 06011 * "hello\nworld".each_line('l') {|s| p s} 06012 * print "Example three\n" 06013 * "hello\n\n\nworld".each_line('') {|s| p s} 06014 * 06015 * <em>produces:</em> 06016 * 06017 * Example one 06018 * "hello\n" 06019 * "world" 06020 * Example two 06021 * "hel" 06022 * "l" 06023 * "o\nworl" 06024 * "d" 06025 * Example three 06026 * "hello\n\n\n" 06027 * "world" 06028 */ 06029 06030 static VALUE 06031 rb_str_each_line(int argc, VALUE *argv, VALUE str) 06032 { 06033 rb_encoding *enc; 06034 VALUE rs; 06035 unsigned int newline; 06036 const char *p, *pend, *s, *ptr; 06037 long len, rslen; 06038 VALUE line; 06039 int n; 06040 VALUE orig = str; 06041 06042 if (argc == 0) { 06043 rs = rb_rs; 06044 } 06045 else { 06046 rb_scan_args(argc, argv, "01", &rs); 06047 } 06048 RETURN_ENUMERATOR(str, argc, argv); 06049 if (NIL_P(rs)) { 06050 rb_yield(str); 06051 return orig; 06052 } 06053 str = rb_str_new4(str); 06054 ptr = p = s = RSTRING_PTR(str); 06055 pend = p + RSTRING_LEN(str); 06056 len = RSTRING_LEN(str); 06057 StringValue(rs); 06058 if (rs == rb_default_rs) { 06059 enc = rb_enc_get(str); 06060 while (p < pend) { 06061 char *p0; 06062 06063 p = memchr(p, '\n', pend - p); 06064 if (!p) break; 06065 p0 = rb_enc_left_char_head(s, p, pend, enc); 06066 if (!rb_enc_is_newline(p0, pend, enc)) { 06067 p++; 06068 continue; 06069 } 06070 p = p0 + rb_enc_mbclen(p0, pend, enc); 06071 line = rb_str_new5(str, s, p - s); 06072 OBJ_INFECT(line, str); 06073 rb_enc_cr_str_copy_for_substr(line, str); 06074 rb_yield(line); 06075 str_mod_check(str, ptr, len); 06076 s = p; 06077 } 06078 goto finish; 06079 } 06080 06081 enc = rb_enc_check(str, rs); 06082 rslen = RSTRING_LEN(rs); 06083 
if (rslen == 0) { 06084 newline = '\n'; 06085 } 06086 else { 06087 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc); 06088 } 06089 06090 while (p < pend) { 06091 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc); 06092 06093 again: 06094 if (rslen == 0 && c == newline) { 06095 p += n; 06096 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) { 06097 goto again; 06098 } 06099 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) { 06100 p += n; 06101 } 06102 p -= n; 06103 } 06104 if (c == newline && 06105 (rslen <= 1 || 06106 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) { 06107 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n)); 06108 OBJ_INFECT(line, str); 06109 rb_enc_cr_str_copy_for_substr(line, str); 06110 rb_yield(line); 06111 str_mod_check(str, ptr, len); 06112 s = p + (rslen ? rslen : n); 06113 } 06114 p += n; 06115 } 06116 06117 finish: 06118 if (s != pend) { 06119 line = rb_str_new5(str, s, pend - s); 06120 OBJ_INFECT(line, str); 06121 rb_enc_cr_str_copy_for_substr(line, str); 06122 rb_yield(line); 06123 } 06124 06125 return orig; 06126 } 06127 06128 06129 /* 06130 * call-seq: 06131 * str.bytes {|fixnum| block } -> str 06132 * str.bytes -> an_enumerator 06133 * 06134 * str.each_byte {|fixnum| block } -> str 06135 * str.each_byte -> an_enumerator 06136 * 06137 * Passes each byte in <i>str</i> to the given block, or returns 06138 * an enumerator if no block is given. 
06139 * 06140 * "hello".each_byte {|c| print c, ' ' } 06141 * 06142 * <em>produces:</em> 06143 * 06144 * 104 101 108 108 111 06145 */ 06146 06147 static VALUE 06148 rb_str_each_byte(VALUE str) 06149 { 06150 long i; 06151 06152 RETURN_ENUMERATOR(str, 0, 0); 06153 for (i=0; i<RSTRING_LEN(str); i++) { 06154 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 06155 } 06156 return str; 06157 } 06158 06159 06160 /* 06161 * call-seq: 06162 * str.chars {|cstr| block } -> str 06163 * str.chars -> an_enumerator 06164 * 06165 * str.each_char {|cstr| block } -> str 06166 * str.each_char -> an_enumerator 06167 * 06168 * Passes each character in <i>str</i> to the given block, or returns 06169 * an enumerator if no block is given. 06170 * 06171 * "hello".each_char {|c| print c, ' ' } 06172 * 06173 * <em>produces:</em> 06174 * 06175 * h e l l o 06176 */ 06177 06178 static VALUE 06179 rb_str_each_char(VALUE str) 06180 { 06181 VALUE orig = str; 06182 long i, len, n; 06183 const char *ptr; 06184 rb_encoding *enc; 06185 06186 RETURN_ENUMERATOR(str, 0, 0); 06187 str = rb_str_new4(str); 06188 ptr = RSTRING_PTR(str); 06189 len = RSTRING_LEN(str); 06190 enc = rb_enc_get(str); 06191 switch (ENC_CODERANGE(str)) { 06192 case ENC_CODERANGE_VALID: 06193 case ENC_CODERANGE_7BIT: 06194 for (i = 0; i < len; i += n) { 06195 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); 06196 rb_yield(rb_str_subseq(str, i, n)); 06197 } 06198 break; 06199 default: 06200 for (i = 0; i < len; i += n) { 06201 n = rb_enc_mbclen(ptr + i, ptr + len, enc); 06202 rb_yield(rb_str_subseq(str, i, n)); 06203 } 06204 } 06205 return orig; 06206 } 06207 06208 /* 06209 * call-seq: 06210 * str.codepoints {|integer| block } -> str 06211 * str.codepoints -> an_enumerator 06212 * 06213 * str.each_codepoint {|integer| block } -> str 06214 * str.each_codepoint -> an_enumerator 06215 * 06216 * Passes the <code>Integer</code> ordinal of each character in <i>str</i>, 06217 * also known as a <i>codepoint</i> when applied to Unicode strings to 
the 06218 * given block. 06219 * 06220 * If no block is given, an enumerator is returned instead. 06221 * 06222 * "hello\u0639".each_codepoint {|c| print c, ' ' } 06223 * 06224 * <em>produces:</em> 06225 * 06226 * 104 101 108 108 111 1593 06227 */ 06228 06229 static VALUE 06230 rb_str_each_codepoint(VALUE str) 06231 { 06232 VALUE orig = str; 06233 int n; 06234 unsigned int c; 06235 const char *ptr, *end; 06236 rb_encoding *enc; 06237 06238 if (single_byte_optimizable(str)) return rb_str_each_byte(str); 06239 RETURN_ENUMERATOR(str, 0, 0); 06240 str = rb_str_new4(str); 06241 ptr = RSTRING_PTR(str); 06242 end = RSTRING_END(str); 06243 enc = STR_ENC_GET(str); 06244 while (ptr < end) { 06245 c = rb_enc_codepoint_len(ptr, end, &n, enc); 06246 rb_yield(UINT2NUM(c)); 06247 ptr += n; 06248 } 06249 return orig; 06250 } 06251 06252 static long 06253 chopped_length(VALUE str) 06254 { 06255 rb_encoding *enc = STR_ENC_GET(str); 06256 const char *p, *p2, *beg, *end; 06257 06258 beg = RSTRING_PTR(str); 06259 end = beg + RSTRING_LEN(str); 06260 if (beg > end) return 0; 06261 p = rb_enc_prev_char(beg, end, end, enc); 06262 if (!p) return 0; 06263 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') { 06264 p2 = rb_enc_prev_char(beg, p, end, enc); 06265 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2; 06266 } 06267 return p - beg; 06268 } 06269 06270 /* 06271 * call-seq: 06272 * str.chop! -> str or nil 06273 * 06274 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>, 06275 * or <code>nil</code> if <i>str</i> is the empty string. See also 06276 * <code>String#chomp!</code>. 
06277 */ 06278 06279 static VALUE 06280 rb_str_chop_bang(VALUE str) 06281 { 06282 str_modify_keep_cr(str); 06283 if (RSTRING_LEN(str) > 0) { 06284 long len; 06285 len = chopped_length(str); 06286 STR_SET_LEN(str, len); 06287 RSTRING_PTR(str)[len] = '\0'; 06288 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06289 ENC_CODERANGE_CLEAR(str); 06290 } 06291 return str; 06292 } 06293 return Qnil; 06294 } 06295 06296 06297 /* 06298 * call-seq: 06299 * str.chop -> new_str 06300 * 06301 * Returns a new <code>String</code> with the last character removed. If the 06302 * string ends with <code>\r\n</code>, both characters are removed. Applying 06303 * <code>chop</code> to an empty string returns an empty 06304 * string. <code>String#chomp</code> is often a safer alternative, as it leaves 06305 * the string unchanged if it doesn't end in a record separator. 06306 * 06307 * "string\r\n".chop #=> "string" 06308 * "string\n\r".chop #=> "string\n" 06309 * "string\n".chop #=> "string" 06310 * "string".chop #=> "strin" 06311 * "x".chop.chop #=> "" 06312 */ 06313 06314 static VALUE 06315 rb_str_chop(VALUE str) 06316 { 06317 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str)); 06318 rb_enc_cr_str_copy_for_substr(str2, str); 06319 OBJ_INFECT(str2, str); 06320 return str2; 06321 } 06322 06323 06324 /* 06325 * call-seq: 06326 * str.chomp!(separator=$/) -> str or nil 06327 * 06328 * Modifies <i>str</i> in place as described for <code>String#chomp</code>, 06329 * returning <i>str</i>, or <code>nil</code> if no modifications were made. 
06330 */ 06331 06332 static VALUE 06333 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) 06334 { 06335 rb_encoding *enc; 06336 VALUE rs; 06337 int newline; 06338 char *p, *pp, *e; 06339 long len, rslen; 06340 06341 str_modify_keep_cr(str); 06342 len = RSTRING_LEN(str); 06343 if (len == 0) return Qnil; 06344 p = RSTRING_PTR(str); 06345 e = p + len; 06346 if (argc == 0) { 06347 rs = rb_rs; 06348 if (rs == rb_default_rs) { 06349 smart_chomp: 06350 enc = rb_enc_get(str); 06351 if (rb_enc_mbminlen(enc) > 1) { 06352 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc); 06353 if (rb_enc_is_newline(pp, e, enc)) { 06354 e = pp; 06355 } 06356 pp = e - rb_enc_mbminlen(enc); 06357 if (pp >= p) { 06358 pp = rb_enc_left_char_head(p, pp, e, enc); 06359 if (rb_enc_ascget(pp, e, 0, enc) == '\r') { 06360 e = pp; 06361 } 06362 } 06363 if (e == RSTRING_END(str)) { 06364 return Qnil; 06365 } 06366 len = e - RSTRING_PTR(str); 06367 STR_SET_LEN(str, len); 06368 } 06369 else { 06370 if (RSTRING_PTR(str)[len-1] == '\n') { 06371 STR_DEC_LEN(str); 06372 if (RSTRING_LEN(str) > 0 && 06373 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') { 06374 STR_DEC_LEN(str); 06375 } 06376 } 06377 else if (RSTRING_PTR(str)[len-1] == '\r') { 06378 STR_DEC_LEN(str); 06379 } 06380 else { 06381 return Qnil; 06382 } 06383 } 06384 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06385 return str; 06386 } 06387 } 06388 else { 06389 rb_scan_args(argc, argv, "01", &rs); 06390 } 06391 if (NIL_P(rs)) return Qnil; 06392 StringValue(rs); 06393 rslen = RSTRING_LEN(rs); 06394 if (rslen == 0) { 06395 while (len>0 && p[len-1] == '\n') { 06396 len--; 06397 if (len>0 && p[len-1] == '\r') 06398 len--; 06399 } 06400 if (len < RSTRING_LEN(str)) { 06401 STR_SET_LEN(str, len); 06402 RSTRING_PTR(str)[len] = '\0'; 06403 return str; 06404 } 06405 return Qnil; 06406 } 06407 if (rslen > len) return Qnil; 06408 newline = RSTRING_PTR(rs)[rslen-1]; 06409 if (rslen == 1 && newline == '\n') 06410 goto smart_chomp; 06411 06412 enc = 
rb_enc_check(str, rs); 06413 if (is_broken_string(rs)) { 06414 return Qnil; 06415 } 06416 pp = e - rslen; 06417 if (p[len-1] == newline && 06418 (rslen <= 1 || 06419 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) { 06420 if (rb_enc_left_char_head(p, pp, e, enc) != pp) 06421 return Qnil; 06422 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06423 ENC_CODERANGE_CLEAR(str); 06424 } 06425 STR_SET_LEN(str, RSTRING_LEN(str) - rslen); 06426 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06427 return str; 06428 } 06429 return Qnil; 06430 } 06431 06432 06433 /* 06434 * call-seq: 06435 * str.chomp(separator=$/) -> new_str 06436 * 06437 * Returns a new <code>String</code> with the given record separator removed 06438 * from the end of <i>str</i> (if present). If <code>$/</code> has not been 06439 * changed from the default Ruby record separator, then <code>chomp</code> also 06440 * removes carriage return characters (that is it will remove <code>\n</code>, 06441 * <code>\r</code>, and <code>\r\n</code>). 06442 * 06443 * "hello".chomp #=> "hello" 06444 * "hello\n".chomp #=> "hello" 06445 * "hello\r\n".chomp #=> "hello" 06446 * "hello\n\r".chomp #=> "hello\n" 06447 * "hello\r".chomp #=> "hello" 06448 * "hello \n there".chomp #=> "hello \n there" 06449 * "hello".chomp("llo") #=> "he" 06450 */ 06451 06452 static VALUE 06453 rb_str_chomp(int argc, VALUE *argv, VALUE str) 06454 { 06455 str = rb_str_dup(str); 06456 rb_str_chomp_bang(argc, argv, str); 06457 return str; 06458 } 06459 06460 /* 06461 * call-seq: 06462 * str.lstrip! -> self or nil 06463 * 06464 * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no 06465 * change was made. See also <code>String#rstrip!</code> and 06466 * <code>String#strip!</code>. 06467 * 06468 * " hello ".lstrip #=> "hello " 06469 * "hello".lstrip! 
#=> nil 06470 */ 06471 06472 static VALUE 06473 rb_str_lstrip_bang(VALUE str) 06474 { 06475 rb_encoding *enc; 06476 char *s, *t, *e; 06477 06478 str_modify_keep_cr(str); 06479 enc = STR_ENC_GET(str); 06480 s = RSTRING_PTR(str); 06481 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06482 e = t = RSTRING_END(str); 06483 /* remove spaces at head */ 06484 while (s < e) { 06485 int n; 06486 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); 06487 06488 if (!rb_isspace(cc)) break; 06489 s += n; 06490 } 06491 06492 if (s > RSTRING_PTR(str)) { 06493 STR_SET_LEN(str, t-s); 06494 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str)); 06495 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06496 return str; 06497 } 06498 return Qnil; 06499 } 06500 06501 06502 /* 06503 * call-seq: 06504 * str.lstrip -> new_str 06505 * 06506 * Returns a copy of <i>str</i> with leading whitespace removed. See also 06507 * <code>String#rstrip</code> and <code>String#strip</code>. 06508 * 06509 * " hello ".lstrip #=> "hello " 06510 * "hello".lstrip #=> "hello" 06511 */ 06512 06513 static VALUE 06514 rb_str_lstrip(VALUE str) 06515 { 06516 str = rb_str_dup(str); 06517 rb_str_lstrip_bang(str); 06518 return str; 06519 } 06520 06521 06522 /* 06523 * call-seq: 06524 * str.rstrip! -> self or nil 06525 * 06526 * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if 06527 * no change was made. See also <code>String#lstrip!</code> and 06528 * <code>String#strip!</code>. 06529 * 06530 * " hello ".rstrip #=> " hello" 06531 * "hello".rstrip! 
#=> nil 06532 */ 06533 06534 static VALUE 06535 rb_str_rstrip_bang(VALUE str) 06536 { 06537 rb_encoding *enc; 06538 char *s, *t, *e; 06539 06540 str_modify_keep_cr(str); 06541 enc = STR_ENC_GET(str); 06542 rb_str_check_dummy_enc(enc); 06543 s = RSTRING_PTR(str); 06544 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06545 t = e = RSTRING_END(str); 06546 06547 /* remove trailing spaces or '\0's */ 06548 if (single_byte_optimizable(str)) { 06549 unsigned char c; 06550 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--; 06551 } 06552 else { 06553 char *tp; 06554 06555 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) { 06556 unsigned int c = rb_enc_codepoint(tp, e, enc); 06557 if (c && !rb_isspace(c)) break; 06558 t = tp; 06559 } 06560 } 06561 if (t < e) { 06562 long len = t-RSTRING_PTR(str); 06563 06564 STR_SET_LEN(str, len); 06565 RSTRING_PTR(str)[len] = '\0'; 06566 return str; 06567 } 06568 return Qnil; 06569 } 06570 06571 06572 /* 06573 * call-seq: 06574 * str.rstrip -> new_str 06575 * 06576 * Returns a copy of <i>str</i> with trailing whitespace removed. See also 06577 * <code>String#lstrip</code> and <code>String#strip</code>. 06578 * 06579 * " hello ".rstrip #=> " hello" 06580 * "hello".rstrip #=> "hello" 06581 */ 06582 06583 static VALUE 06584 rb_str_rstrip(VALUE str) 06585 { 06586 str = rb_str_dup(str); 06587 rb_str_rstrip_bang(str); 06588 return str; 06589 } 06590 06591 06592 /* 06593 * call-seq: 06594 * str.strip! -> str or nil 06595 * 06596 * Removes leading and trailing whitespace from <i>str</i>. Returns 06597 * <code>nil</code> if <i>str</i> was not altered. 
06598 */ 06599 06600 static VALUE 06601 rb_str_strip_bang(VALUE str) 06602 { 06603 VALUE l = rb_str_lstrip_bang(str); 06604 VALUE r = rb_str_rstrip_bang(str); 06605 06606 if (NIL_P(l) && NIL_P(r)) return Qnil; 06607 return str; 06608 } 06609 06610 06611 /* 06612 * call-seq: 06613 * str.strip -> new_str 06614 * 06615 * Returns a copy of <i>str</i> with leading and trailing whitespace removed. 06616 * 06617 * " hello ".strip #=> "hello" 06618 * "\tgoodbye\r\n".strip #=> "goodbye" 06619 */ 06620 06621 static VALUE 06622 rb_str_strip(VALUE str) 06623 { 06624 str = rb_str_dup(str); 06625 rb_str_strip_bang(str); 06626 return str; 06627 } 06628 06629 static VALUE 06630 scan_once(VALUE str, VALUE pat, long *start) 06631 { 06632 VALUE result, match; 06633 struct re_registers *regs; 06634 int i; 06635 06636 if (rb_reg_search(pat, str, *start, 0) >= 0) { 06637 match = rb_backref_get(); 06638 regs = RMATCH_REGS(match); 06639 if (BEG(0) == END(0)) { 06640 rb_encoding *enc = STR_ENC_GET(str); 06641 /* 06642 * Always consume at least one character of the input string 06643 */ 06644 if (RSTRING_LEN(str) > END(0)) 06645 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0), 06646 RSTRING_END(str), enc); 06647 else 06648 *start = END(0)+1; 06649 } 06650 else { 06651 *start = END(0); 06652 } 06653 if (regs->num_regs == 1) { 06654 return rb_reg_nth_match(0, match); 06655 } 06656 result = rb_ary_new2(regs->num_regs); 06657 for (i=1; i < regs->num_regs; i++) { 06658 rb_ary_push(result, rb_reg_nth_match(i, match)); 06659 } 06660 06661 return result; 06662 } 06663 return Qnil; 06664 } 06665 06666 06667 /* 06668 * call-seq: 06669 * str.scan(pattern) -> array 06670 * str.scan(pattern) {|match, ...| block } -> str 06671 * 06672 * Both forms iterate through <i>str</i>, matching the pattern (which may be a 06673 * <code>Regexp</code> or a <code>String</code>). For each match, a result is 06674 * generated and either added to the result array or passed to the block. 
If 06675 * the pattern contains no groups, each individual result consists of the 06676 * matched string, <code>$&</code>. If the pattern contains groups, each 06677 * individual result is itself an array containing one entry per group. 06678 * 06679 * a = "cruel world" 06680 * a.scan(/\w+/) #=> ["cruel", "world"] 06681 * a.scan(/.../) #=> ["cru", "el ", "wor"] 06682 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]] 06683 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]] 06684 * 06685 * And the block form: 06686 * 06687 * a.scan(/\w+/) {|w| print "<<#{w}>> " } 06688 * print "\n" 06689 * a.scan(/(.)(.)/) {|x,y| print y, x } 06690 * print "\n" 06691 * 06692 * <em>produces:</em> 06693 * 06694 * <<cruel>> <<world>> 06695 * rceu lowlr 06696 */ 06697 06698 static VALUE 06699 rb_str_scan(VALUE str, VALUE pat) 06700 { 06701 VALUE result; 06702 long start = 0; 06703 long last = -1, prev = 0; 06704 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str); 06705 06706 pat = get_pat(pat, 1); 06707 if (!rb_block_given_p()) { 06708 VALUE ary = rb_ary_new(); 06709 06710 while (!NIL_P(result = scan_once(str, pat, &start))) { 06711 last = prev; 06712 prev = start; 06713 rb_ary_push(ary, result); 06714 } 06715 if (last >= 0) rb_reg_search(pat, str, last, 0); 06716 return ary; 06717 } 06718 06719 while (!NIL_P(result = scan_once(str, pat, &start))) { 06720 last = prev; 06721 prev = start; 06722 rb_yield(result); 06723 str_mod_check(str, p, len); 06724 } 06725 if (last >= 0) rb_reg_search(pat, str, last, 0); 06726 return str; 06727 } 06728 06729 06730 /* 06731 * call-seq: 06732 * str.hex -> integer 06733 * 06734 * Treats leading characters from <i>str</i> as a string of hexadecimal digits 06735 * (with an optional sign and an optional <code>0x</code>) and returns the 06736 * corresponding number. Zero is returned on error. 
06737 * 06738 * "0x0a".hex #=> 10 06739 * "-1234".hex #=> -4660 06740 * "0".hex #=> 0 06741 * "wombat".hex #=> 0 06742 */ 06743 06744 static VALUE 06745 rb_str_hex(VALUE str) 06746 { 06747 rb_encoding *enc = rb_enc_get(str); 06748 06749 if (!rb_enc_asciicompat(enc)) { 06750 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 06751 } 06752 return rb_str_to_inum(str, 16, FALSE); 06753 } 06754 06755 06756 /* 06757 * call-seq: 06758 * str.oct -> integer 06759 * 06760 * Treats leading characters of <i>str</i> as a string of octal digits (with an 06761 * optional sign) and returns the corresponding number. Returns 0 if the 06762 * conversion fails. 06763 * 06764 * "123".oct #=> 83 06765 * "-377".oct #=> -255 06766 * "bad".oct #=> 0 06767 * "0377bad".oct #=> 255 06768 */ 06769 06770 static VALUE 06771 rb_str_oct(VALUE str) 06772 { 06773 rb_encoding *enc = rb_enc_get(str); 06774 06775 if (!rb_enc_asciicompat(enc)) { 06776 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 06777 } 06778 return rb_str_to_inum(str, -8, FALSE); 06779 } 06780 06781 06782 /* 06783 * call-seq: 06784 * str.crypt(other_str) -> new_str 06785 * 06786 * Applies a one-way cryptographic hash to <i>str</i> by invoking the standard 06787 * library function <code>crypt</code>. The argument is the salt string, which 06788 * should be two characters long, each character drawn from 06789 * <code>[a-zA-Z0-9./]</code>. 
06790 */ 06791 06792 static VALUE 06793 rb_str_crypt(VALUE str, VALUE salt) 06794 { 06795 extern char *crypt(const char *, const char *); 06796 VALUE result; 06797 const char *s, *saltp; 06798 char *res; 06799 #ifdef BROKEN_CRYPT 06800 char salt_8bit_clean[3]; 06801 #endif 06802 06803 StringValue(salt); 06804 if (RSTRING_LEN(salt) < 2) 06805 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)"); 06806 06807 s = RSTRING_PTR(str); 06808 if (!s) s = ""; 06809 saltp = RSTRING_PTR(salt); 06810 #ifdef BROKEN_CRYPT 06811 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) { 06812 salt_8bit_clean[0] = saltp[0] & 0x7f; 06813 salt_8bit_clean[1] = saltp[1] & 0x7f; 06814 salt_8bit_clean[2] = '\0'; 06815 saltp = salt_8bit_clean; 06816 } 06817 #endif 06818 res = crypt(s, saltp); 06819 if (!res) { 06820 rb_sys_fail("crypt"); 06821 } 06822 result = rb_str_new2(res); 06823 OBJ_INFECT(result, str); 06824 OBJ_INFECT(result, salt); 06825 return result; 06826 } 06827 06828 06829 /* 06830 * call-seq: 06831 * str.intern -> symbol 06832 * str.to_sym -> symbol 06833 * 06834 * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the 06835 * symbol if it did not previously exist. See <code>Symbol#id2name</code>. 06836 * 06837 * "Koala".intern #=> :Koala 06838 * s = 'cat'.to_sym #=> :cat 06839 * s == :cat #=> true 06840 * s = '@cat'.to_sym #=> :@cat 06841 * s == :@cat #=> true 06842 * 06843 * This can also be used to create symbols that cannot be represented using the 06844 * <code>:xxx</code> notation. 06845 * 06846 * 'cat and dog'.to_sym #=> :"cat and dog" 06847 */ 06848 06849 VALUE 06850 rb_str_intern(VALUE s) 06851 { 06852 VALUE str = RB_GC_GUARD(s); 06853 ID id; 06854 06855 id = rb_intern_str(str); 06856 return ID2SYM(id); 06857 } 06858 06859 06860 /* 06861 * call-seq: 06862 * str.ord -> integer 06863 * 06864 * Return the <code>Integer</code> ordinal of a one-character string. 
06865 * 06866 * "a".ord #=> 97 06867 */ 06868 06869 VALUE 06870 rb_str_ord(VALUE s) 06871 { 06872 unsigned int c; 06873 06874 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s)); 06875 return UINT2NUM(c); 06876 } 06877 /* 06878 * call-seq: 06879 * str.sum(n=16) -> integer 06880 * 06881 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>, 06882 * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting 06883 * to 16. The result is simply the sum of the binary value of each character in 06884 * <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good 06885 * checksum. 06886 */ 06887 06888 static VALUE 06889 rb_str_sum(int argc, VALUE *argv, VALUE str) 06890 { 06891 VALUE vbits; 06892 int bits; 06893 char *ptr, *p, *pend; 06894 long len; 06895 VALUE sum = INT2FIX(0); 06896 unsigned long sum0 = 0; 06897 06898 if (argc == 0) { 06899 bits = 16; 06900 } 06901 else { 06902 rb_scan_args(argc, argv, "01", &vbits); 06903 bits = NUM2INT(vbits); 06904 } 06905 ptr = p = RSTRING_PTR(str); 06906 len = RSTRING_LEN(str); 06907 pend = p + len; 06908 06909 while (p < pend) { 06910 if (FIXNUM_MAX - UCHAR_MAX < sum0) { 06911 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06912 str_mod_check(str, ptr, len); 06913 sum0 = 0; 06914 } 06915 sum0 += (unsigned char)*p; 06916 p++; 06917 } 06918 06919 if (bits == 0) { 06920 if (sum0) { 06921 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06922 } 06923 } 06924 else { 06925 if (sum == INT2FIX(0)) { 06926 if (bits < (int)sizeof(long)*CHAR_BIT) { 06927 sum0 &= (((unsigned long)1)<<bits)-1; 06928 } 06929 sum = LONG2FIX(sum0); 06930 } 06931 else { 06932 VALUE mod; 06933 06934 if (sum0) { 06935 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06936 } 06937 06938 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits)); 06939 mod = rb_funcall(mod, '-', 1, INT2FIX(1)); 06940 sum = rb_funcall(sum, '&', 1, mod); 06941 } 06942 } 06943 return sum; 06944 } 06945 06946 static VALUE 
06947 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) 06948 { 06949 rb_encoding *enc; 06950 VALUE w; 06951 long width, len, flen = 1, fclen = 1; 06952 VALUE res; 06953 char *p; 06954 const char *f = " "; 06955 long n, size, llen, rlen, llen2 = 0, rlen2 = 0; 06956 volatile VALUE pad; 06957 int singlebyte = 1, cr; 06958 06959 rb_scan_args(argc, argv, "11", &w, &pad); 06960 enc = STR_ENC_GET(str); 06961 width = NUM2LONG(w); 06962 if (argc == 2) { 06963 StringValue(pad); 06964 enc = rb_enc_check(str, pad); 06965 f = RSTRING_PTR(pad); 06966 flen = RSTRING_LEN(pad); 06967 fclen = str_strlen(pad, enc); 06968 singlebyte = single_byte_optimizable(pad); 06969 if (flen == 0 || fclen == 0) { 06970 rb_raise(rb_eArgError, "zero width padding"); 06971 } 06972 } 06973 len = str_strlen(str, enc); 06974 if (width < 0 || len >= width) return rb_str_dup(str); 06975 n = width - len; 06976 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2); 06977 rlen = n - llen; 06978 cr = ENC_CODERANGE(str); 06979 if (flen > 1) { 06980 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte); 06981 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte); 06982 } 06983 size = RSTRING_LEN(str); 06984 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen || 06985 (len *= flen) >= LONG_MAX - llen2 - rlen2 || 06986 (len += llen2 + rlen2) >= LONG_MAX - size) { 06987 rb_raise(rb_eArgError, "argument too big"); 06988 } 06989 len += size; 06990 res = rb_str_new5(str, 0, len); 06991 p = RSTRING_PTR(res); 06992 if (flen <= 1) { 06993 memset(p, *f, llen); 06994 p += llen; 06995 } 06996 else { 06997 while (llen >= fclen) { 06998 memcpy(p,f,flen); 06999 p += flen; 07000 llen -= fclen; 07001 } 07002 if (llen > 0) { 07003 memcpy(p, f, llen2); 07004 p += llen2; 07005 } 07006 } 07007 memcpy(p, RSTRING_PTR(str), size); 07008 p += size; 07009 if (flen <= 1) { 07010 memset(p, *f, rlen); 07011 p += rlen; 07012 } 07013 else { 07014 while (rlen >= fclen) { 07015 memcpy(p,f,flen); 07016 p += 
flen; 07017 rlen -= fclen; 07018 } 07019 if (rlen > 0) { 07020 memcpy(p, f, rlen2); 07021 p += rlen2; 07022 } 07023 } 07024 *p = '\0'; 07025 STR_SET_LEN(res, p-RSTRING_PTR(res)); 07026 OBJ_INFECT(res, str); 07027 if (!NIL_P(pad)) OBJ_INFECT(res, pad); 07028 rb_enc_associate(res, enc); 07029 if (argc == 2) 07030 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)); 07031 if (cr != ENC_CODERANGE_BROKEN) 07032 ENC_CODERANGE_SET(res, cr); 07033 return res; 07034 } 07035 07036 07037 /* 07038 * call-seq: 07039 * str.ljust(integer, padstr=' ') -> new_str 07040 * 07041 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07042 * <code>String</code> of length <i>integer</i> with <i>str</i> left justified 07043 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07044 * 07045 * "hello".ljust(4) #=> "hello" 07046 * "hello".ljust(20) #=> "hello " 07047 * "hello".ljust(20, '1234') #=> "hello123412341234123" 07048 */ 07049 07050 static VALUE 07051 rb_str_ljust(int argc, VALUE *argv, VALUE str) 07052 { 07053 return rb_str_justify(argc, argv, str, 'l'); 07054 } 07055 07056 07057 /* 07058 * call-seq: 07059 * str.rjust(integer, padstr=' ') -> new_str 07060 * 07061 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07062 * <code>String</code> of length <i>integer</i> with <i>str</i> right justified 07063 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 
07064 * 07065 * "hello".rjust(4) #=> "hello" 07066 * "hello".rjust(20) #=> " hello" 07067 * "hello".rjust(20, '1234') #=> "123412341234123hello" 07068 */ 07069 07070 static VALUE 07071 rb_str_rjust(int argc, VALUE *argv, VALUE str) 07072 { 07073 return rb_str_justify(argc, argv, str, 'r'); 07074 } 07075 07076 07077 /* 07078 * call-seq: 07079 * str.center(integer, padstr) -> new_str 07080 * 07081 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07082 * <code>String</code> of length <i>integer</i> with <i>str</i> centered and 07083 * padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07084 * 07085 * "hello".center(4) #=> "hello" 07086 * "hello".center(20) #=> " hello " 07087 * "hello".center(20, '123') #=> "1231231hello12312312" 07088 */ 07089 07090 static VALUE 07091 rb_str_center(int argc, VALUE *argv, VALUE str) 07092 { 07093 return rb_str_justify(argc, argv, str, 'c'); 07094 } 07095 07096 /* 07097 * call-seq: 07098 * str.partition(sep) -> [head, sep, tail] 07099 * str.partition(regexp) -> [head, match, tail] 07100 * 07101 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string 07102 * and returns the part before it, the match, and the part 07103 * after it. 07104 * If it is not found, returns two empty strings and <i>str</i>. 
07105 * 07106 * "hello".partition("l") #=> ["he", "l", "lo"] 07107 * "hello".partition("x") #=> ["hello", "", ""] 07108 * "hello".partition(/.l/) #=> ["h", "el", "lo"] 07109 */ 07110 07111 static VALUE 07112 rb_str_partition(VALUE str, VALUE sep) 07113 { 07114 long pos; 07115 int regex = FALSE; 07116 07117 if (TYPE(sep) == T_REGEXP) { 07118 pos = rb_reg_search(sep, str, 0, 0); 07119 regex = TRUE; 07120 } 07121 else { 07122 VALUE tmp; 07123 07124 tmp = rb_check_string_type(sep); 07125 if (NIL_P(tmp)) { 07126 rb_raise(rb_eTypeError, "type mismatch: %s given", 07127 rb_obj_classname(sep)); 07128 } 07129 sep = tmp; 07130 pos = rb_str_index(str, sep, 0); 07131 } 07132 if (pos < 0) { 07133 failed: 07134 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str)); 07135 } 07136 if (regex) { 07137 sep = rb_str_subpat(str, sep, INT2FIX(0)); 07138 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; 07139 } 07140 return rb_ary_new3(3, rb_str_subseq(str, 0, pos), 07141 sep, 07142 rb_str_subseq(str, pos+RSTRING_LEN(sep), 07143 RSTRING_LEN(str)-pos-RSTRING_LEN(sep))); 07144 } 07145 07146 /* 07147 * call-seq: 07148 * str.rpartition(sep) -> [head, sep, tail] 07149 * str.rpartition(regexp) -> [head, match, tail] 07150 * 07151 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end 07152 * of the string, and returns the part before it, the match, and the part 07153 * after it. 07154 * If it is not found, returns two empty strings and <i>str</i>. 
07155 * 07156 * "hello".rpartition("l") #=> ["hel", "l", "o"] 07157 * "hello".rpartition("x") #=> ["", "", "hello"] 07158 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"] 07159 */ 07160 07161 static VALUE 07162 rb_str_rpartition(VALUE str, VALUE sep) 07163 { 07164 long pos = RSTRING_LEN(str); 07165 int regex = FALSE; 07166 07167 if (TYPE(sep) == T_REGEXP) { 07168 pos = rb_reg_search(sep, str, pos, 1); 07169 regex = TRUE; 07170 } 07171 else { 07172 VALUE tmp; 07173 07174 tmp = rb_check_string_type(sep); 07175 if (NIL_P(tmp)) { 07176 rb_raise(rb_eTypeError, "type mismatch: %s given", 07177 rb_obj_classname(sep)); 07178 } 07179 sep = tmp; 07180 pos = rb_str_sublen(str, pos); 07181 pos = rb_str_rindex(str, sep, pos); 07182 } 07183 if (pos < 0) { 07184 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str); 07185 } 07186 if (regex) { 07187 sep = rb_reg_nth_match(0, rb_backref_get()); 07188 } 07189 return rb_ary_new3(3, rb_str_substr(str, 0, pos), 07190 sep, 07191 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str))); 07192 } 07193 07194 /* 07195 * call-seq: 07196 * str.start_with?([prefix]+) -> true or false 07197 * 07198 * Returns true if <i>str</i> starts with one of the prefixes given. 07199 * 07200 * p "hello".start_with?("hell") #=> true 07201 * 07202 * # returns true if one of the prefixes matches. 
07203 * p "hello".start_with?("heaven", "hell") #=> true 07204 * p "hello".start_with?("heaven", "paradise") #=> false 07205 * 07206 * 07207 * 07208 */ 07209 07210 static VALUE 07211 rb_str_start_with(int argc, VALUE *argv, VALUE str) 07212 { 07213 int i; 07214 07215 for (i=0; i<argc; i++) { 07216 VALUE tmp = rb_check_string_type(argv[i]); 07217 if (NIL_P(tmp)) continue; 07218 rb_enc_check(str, tmp); 07219 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07220 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07221 return Qtrue; 07222 } 07223 return Qfalse; 07224 } 07225 07226 /* 07227 * call-seq: 07228 * str.end_with?([suffix]+) -> true or false 07229 * 07230 * Returns true if <i>str</i> ends with one of the suffixes given. 07231 */ 07232 07233 static VALUE 07234 rb_str_end_with(int argc, VALUE *argv, VALUE str) 07235 { 07236 int i; 07237 char *p, *s, *e; 07238 rb_encoding *enc; 07239 07240 for (i=0; i<argc; i++) { 07241 VALUE tmp = rb_check_string_type(argv[i]); 07242 if (NIL_P(tmp)) continue; 07243 enc = rb_enc_check(str, tmp); 07244 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07245 p = RSTRING_PTR(str); 07246 e = p + RSTRING_LEN(str); 07247 s = e - RSTRING_LEN(tmp); 07248 if (rb_enc_left_char_head(p, s, e, enc) != s) 07249 continue; 07250 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07251 return Qtrue; 07252 } 07253 return Qfalse; 07254 } 07255 07256 void 07257 rb_str_setter(VALUE val, ID id, VALUE *var) 07258 { 07259 if (!NIL_P(val) && TYPE(val) != T_STRING) { 07260 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id)); 07261 } 07262 *var = val; 07263 } 07264 07265 07266 /* 07267 * call-seq: 07268 * str.force_encoding(encoding) -> str 07269 * 07270 * Changes the encoding to +encoding+ and returns self. 
07271 */ 07272 07273 static VALUE 07274 rb_str_force_encoding(VALUE str, VALUE enc) 07275 { 07276 str_modifiable(str); 07277 rb_enc_associate(str, rb_to_encoding(enc)); 07278 ENC_CODERANGE_CLEAR(str); 07279 return str; 07280 } 07281 07282 /* 07283 * call-seq: 07284 * str.valid_encoding? -> true or false 07285 * 07286 * Returns true for a string which encoded correctly. 07287 * 07288 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true 07289 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false 07290 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false 07291 */ 07292 07293 static VALUE 07294 rb_str_valid_encoding_p(VALUE str) 07295 { 07296 int cr = rb_enc_str_coderange(str); 07297 07298 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue; 07299 } 07300 07301 /* 07302 * call-seq: 07303 * str.ascii_only? -> true or false 07304 * 07305 * Returns true for a string which has only ASCII characters. 07306 * 07307 * "abc".force_encoding("UTF-8").ascii_only? #=> true 07308 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false 07309 */ 07310 07311 static VALUE 07312 rb_str_is_ascii_only_p(VALUE str) 07313 { 07314 int cr = rb_enc_str_coderange(str); 07315 07316 return cr == ENC_CODERANGE_7BIT ? 
Qtrue : Qfalse; 07317 } 07318 07333 VALUE 07334 rb_str_ellipsize(VALUE str, long len) 07335 { 07336 static const char ellipsis[] = "..."; 07337 const long ellipsislen = sizeof(ellipsis) - 1; 07338 rb_encoding *const enc = rb_enc_get(str); 07339 const long blen = RSTRING_LEN(str); 07340 const char *const p = RSTRING_PTR(str), *e = p + blen; 07341 VALUE estr, ret = 0; 07342 07343 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 07344 if (len * rb_enc_mbminlen(enc) >= blen || 07345 (e = rb_enc_nth(p, e, len, enc)) - p == blen) { 07346 ret = str; 07347 } 07348 else if (len <= ellipsislen || 07349 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) { 07350 if (rb_enc_asciicompat(enc)) { 07351 ret = rb_str_new_with_class(str, ellipsis, len); 07352 rb_enc_associate(ret, enc); 07353 } 07354 else { 07355 estr = rb_usascii_str_new(ellipsis, len); 07356 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil); 07357 } 07358 } 07359 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) { 07360 rb_str_cat(ret, ellipsis, ellipsislen); 07361 } 07362 else { 07363 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen), 07364 rb_enc_from_encoding(enc), 0, Qnil); 07365 rb_str_append(ret, estr); 07366 } 07367 return ret; 07368 } 07369 07370 /********************************************************************** 07371 * Document-class: Symbol 07372 * 07373 * <code>Symbol</code> objects represent names and some strings 07374 * inside the Ruby 07375 * interpreter. They are generated using the <code>:name</code> and 07376 * <code>:"string"</code> literals 07377 * syntax, and by the various <code>to_sym</code> methods. The same 07378 * <code>Symbol</code> object will be created for a given name or string 07379 * for the duration of a program's execution, regardless of the context 07380 * or meaning of that name. 
Thus if <code>Fred</code> is a constant in 07381 * one context, a method in another, and a class in a third, the 07382 * <code>Symbol</code> <code>:Fred</code> will be the same object in 07383 * all three contexts. 07384 * 07385 * module One 07386 * class Fred 07387 * end 07388 * $f1 = :Fred 07389 * end 07390 * module Two 07391 * Fred = 1 07392 * $f2 = :Fred 07393 * end 07394 * def Fred() 07395 * end 07396 * $f3 = :Fred 07397 * $f1.object_id #=> 2514190 07398 * $f2.object_id #=> 2514190 07399 * $f3.object_id #=> 2514190 07400 * 07401 */ 07402 07403 07404 /* 07405 * call-seq: 07406 * sym == obj -> true or false 07407 * 07408 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same 07409 * symbol, returns <code>true</code>. 07410 */ 07411 07412 static VALUE 07413 sym_equal(VALUE sym1, VALUE sym2) 07414 { 07415 if (sym1 == sym2) return Qtrue; 07416 return Qfalse; 07417 } 07418 07419 07420 static int 07421 sym_printable(const char *s, const char *send, rb_encoding *enc) 07422 { 07423 while (s < send) { 07424 int n; 07425 int c = rb_enc_codepoint_len(s, send, &n, enc); 07426 07427 if (!rb_enc_isprint(c, enc)) return FALSE; 07428 s += n; 07429 } 07430 return TRUE; 07431 } 07432 07433 /* 07434 * call-seq: 07435 * sym.inspect -> string 07436 * 07437 * Returns the representation of <i>sym</i> as a symbol literal. 
07438 * 07439 * :fred.inspect #=> ":fred" 07440 */ 07441 07442 static VALUE 07443 sym_inspect(VALUE sym) 07444 { 07445 VALUE str; 07446 ID id = SYM2ID(sym); 07447 rb_encoding *enc; 07448 const char *ptr; 07449 long len; 07450 char *dest; 07451 rb_encoding *resenc = rb_default_internal_encoding(); 07452 07453 if (resenc == NULL) resenc = rb_default_external_encoding(); 07454 sym = rb_id2str(id); 07455 enc = STR_ENC_GET(sym); 07456 ptr = RSTRING_PTR(sym); 07457 len = RSTRING_LEN(sym); 07458 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) || 07459 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) { 07460 str = rb_str_inspect(sym); 07461 len = RSTRING_LEN(str); 07462 rb_str_resize(str, len + 1); 07463 dest = RSTRING_PTR(str); 07464 memmove(dest + 1, dest, len); 07465 dest[0] = ':'; 07466 } 07467 else { 07468 char *dest; 07469 str = rb_enc_str_new(0, len + 1, enc); 07470 dest = RSTRING_PTR(str); 07471 dest[0] = ':'; 07472 memcpy(dest + 1, ptr, len); 07473 } 07474 return str; 07475 } 07476 07477 07478 /* 07479 * call-seq: 07480 * sym.id2name -> string 07481 * sym.to_s -> string 07482 * 07483 * Returns the name or string corresponding to <i>sym</i>. 07484 * 07485 * :fred.id2name #=> "fred" 07486 */ 07487 07488 07489 VALUE 07490 rb_sym_to_s(VALUE sym) 07491 { 07492 ID id = SYM2ID(sym); 07493 07494 return str_new3(rb_cString, rb_id2str(id)); 07495 } 07496 07497 07498 /* 07499 * call-seq: 07500 * sym.to_sym -> sym 07501 * sym.intern -> sym 07502 * 07503 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding 07504 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned 07505 * in this case. 
07506 */ 07507 07508 static VALUE 07509 sym_to_sym(VALUE sym) 07510 { 07511 return sym; 07512 } 07513 07514 static VALUE 07515 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc) 07516 { 07517 VALUE obj; 07518 07519 if (argc < 1) { 07520 rb_raise(rb_eArgError, "no receiver given"); 07521 } 07522 obj = argv[0]; 07523 return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc); 07524 } 07525 07526 /* 07527 * call-seq: 07528 * sym.to_proc 07529 * 07530 * Returns a _Proc_ object which respond to the given method by _sym_. 07531 * 07532 * (1..3).collect(&:to_s) #=> ["1", "2", "3"] 07533 */ 07534 07535 static VALUE 07536 sym_to_proc(VALUE sym) 07537 { 07538 static VALUE sym_proc_cache = Qfalse; 07539 enum {SYM_PROC_CACHE_SIZE = 67}; 07540 VALUE proc; 07541 long id, index; 07542 VALUE *aryp; 07543 07544 if (!sym_proc_cache) { 07545 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2); 07546 rb_gc_register_mark_object(sym_proc_cache); 07547 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil); 07548 } 07549 07550 id = SYM2ID(sym); 07551 index = (id % SYM_PROC_CACHE_SIZE) << 1; 07552 07553 aryp = RARRAY_PTR(sym_proc_cache); 07554 if (aryp[index] == sym) { 07555 return aryp[index + 1]; 07556 } 07557 else { 07558 proc = rb_proc_new(sym_call, (VALUE)id); 07559 aryp[index] = sym; 07560 aryp[index + 1] = proc; 07561 return proc; 07562 } 07563 } 07564 07565 /* 07566 * call-seq: 07567 * 07568 * sym.succ 07569 * 07570 * Same as <code>sym.to_s.succ.intern</code>. 07571 */ 07572 07573 static VALUE 07574 sym_succ(VALUE sym) 07575 { 07576 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym))); 07577 } 07578 07579 /* 07580 * call-seq: 07581 * 07582 * str <=> other -> -1, 0, +1 or nil 07583 * 07584 * Compares _sym_ with _other_ in string form. 
07585 */ 07586 07587 static VALUE 07588 sym_cmp(VALUE sym, VALUE other) 07589 { 07590 if (!SYMBOL_P(other)) { 07591 return Qnil; 07592 } 07593 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other)); 07594 } 07595 07596 /* 07597 * call-seq: 07598 * 07599 * sym.casecmp(other) -> -1, 0, +1 or nil 07600 * 07601 * Case-insensitive version of <code>Symbol#<=></code>. 07602 */ 07603 07604 static VALUE 07605 sym_casecmp(VALUE sym, VALUE other) 07606 { 07607 if (!SYMBOL_P(other)) { 07608 return Qnil; 07609 } 07610 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other)); 07611 } 07612 07613 /* 07614 * call-seq: 07615 * sym =~ obj -> fixnum or nil 07616 * 07617 * Returns <code>sym.to_s =~ obj</code>. 07618 */ 07619 07620 static VALUE 07621 sym_match(VALUE sym, VALUE other) 07622 { 07623 return rb_str_match(rb_sym_to_s(sym), other); 07624 } 07625 07626 /* 07627 * call-seq: 07628 * sym[idx] -> char 07629 * sym[b, n] -> char 07630 * 07631 * Returns <code>sym.to_s[]</code>. 07632 */ 07633 07634 static VALUE 07635 sym_aref(int argc, VALUE *argv, VALUE sym) 07636 { 07637 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym)); 07638 } 07639 07640 /* 07641 * call-seq: 07642 * sym.length -> integer 07643 * 07644 * Same as <code>sym.to_s.length</code>. 07645 */ 07646 07647 static VALUE 07648 sym_length(VALUE sym) 07649 { 07650 return rb_str_length(rb_id2str(SYM2ID(sym))); 07651 } 07652 07653 /* 07654 * call-seq: 07655 * sym.empty? -> true or false 07656 * 07657 * Returns that _sym_ is :"" or not. 07658 */ 07659 07660 static VALUE 07661 sym_empty(VALUE sym) 07662 { 07663 return rb_str_empty(rb_id2str(SYM2ID(sym))); 07664 } 07665 07666 /* 07667 * call-seq: 07668 * sym.upcase -> symbol 07669 * 07670 * Same as <code>sym.to_s.upcase.intern</code>. 
07671 */ 07672 07673 static VALUE 07674 sym_upcase(VALUE sym) 07675 { 07676 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym)))); 07677 } 07678 07679 /* 07680 * call-seq: 07681 * sym.downcase -> symbol 07682 * 07683 * Same as <code>sym.to_s.downcase.intern</code>. 07684 */ 07685 07686 static VALUE 07687 sym_downcase(VALUE sym) 07688 { 07689 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym)))); 07690 } 07691 07692 /* 07693 * call-seq: 07694 * sym.capitalize -> symbol 07695 * 07696 * Same as <code>sym.to_s.capitalize.intern</code>. 07697 */ 07698 07699 static VALUE 07700 sym_capitalize(VALUE sym) 07701 { 07702 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym)))); 07703 } 07704 07705 /* 07706 * call-seq: 07707 * sym.swapcase -> symbol 07708 * 07709 * Same as <code>sym.to_s.swapcase.intern</code>. 07710 */ 07711 07712 static VALUE 07713 sym_swapcase(VALUE sym) 07714 { 07715 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym)))); 07716 } 07717 07718 /* 07719 * call-seq: 07720 * sym.encoding -> encoding 07721 * 07722 * Returns the Encoding object that represents the encoding of _sym_. 07723 */ 07724 07725 static VALUE 07726 sym_encoding(VALUE sym) 07727 { 07728 return rb_obj_encoding(rb_id2str(SYM2ID(sym))); 07729 } 07730 07731 ID 07732 rb_to_id(VALUE name) 07733 { 07734 VALUE tmp; 07735 07736 switch (TYPE(name)) { 07737 default: 07738 tmp = rb_check_string_type(name); 07739 if (NIL_P(tmp)) { 07740 tmp = rb_inspect(name); 07741 rb_raise(rb_eTypeError, "%s is not a symbol", 07742 RSTRING_PTR(tmp)); 07743 } 07744 name = tmp; 07745 /* fall through */ 07746 case T_STRING: 07747 name = rb_str_intern(name); 07748 /* fall through */ 07749 case T_SYMBOL: 07750 return SYM2ID(name); 07751 } 07752 return Qnil; /* not reached */ 07753 } 07754 07755 /* 07756 * A <code>String</code> object holds and manipulates an arbitrary sequence of 07757 * bytes, typically representing characters. 
 *  String objects may be created
 *  using <code>String::new</code> or as literals.
 *
 *  Because of aliasing issues, users of strings should be aware of the methods
 *  that modify the contents of a <code>String</code> object.  Typically,
 *  methods with names ending in ``!'' modify their receiver, while those
 *  without a ``!'' return a new <code>String</code>.  However, there are
 *  exceptions, such as <code>String#[]=</code>.
 *
 */

/*
 * Defines the String and Symbol classes and registers their methods, and
 * sets up id_to_s and the rb_fs variable ($; and $-F).
 */
void
Init_String(void)
{
    /* Within this function rb_intern expands to rb_intern_const. */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    /* String: class setup, construction, comparison and basic operators. */
    rb_cString = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "===", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);

    /* String: conversions. */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* String: case conversion, copying and in-place ("!") variants. */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    /* "lines", "bytes", "chars" and "codepoints" share the each_* functions. */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
    rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    id_to_s = rb_intern("to_s");

    /* rb_fs starts out as nil and is exposed under two global names. */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /* Symbol: no allocator and no Symbol.new. */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    /* Note: "match" is bound to the same function as "=~". */
    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}

1.7.6.1