sql_string.cc 29.5 KB
Newer Older
unknown's avatar
unknown committed
1 2 3 4
/* Copyright (C) 2000 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
unknown's avatar
unknown committed
5
   the Free Software Foundation; version 2 of the License.
unknown's avatar
unknown committed
6 7

   This program is distributed in the hope that it will be useful,
unknown's avatar
unknown committed
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
unknown's avatar
unknown committed
9 10 11 12 13 14
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
unknown's avatar
unknown committed
15 16 17

/* This file is originally from the mysql distribution. Coded by monty */

18
#ifdef USE_PRAGMA_IMPLEMENTATION
unknown's avatar
unknown committed
19 20 21
#pragma implementation				// gcc: Class implementation
#endif

22
#include <my_global.h>
unknown's avatar
unknown committed
23 24 25
#include <my_sys.h>
#include <m_string.h>
#include <m_ctype.h>
26
#include <mysql_com.h>
27 28 29 30 31
/*
  The following extern declarations are ok as these are interface functions
  required by the string function
*/

32
extern uchar* sql_alloc(unsigned size);
unknown's avatar
unknown committed
33 34 35 36 37 38 39 40 41 42 43
extern void sql_element_free(void *ptr);

#include "sql_string.h"

/*****************************************************************************
** String functions
*****************************************************************************/

/*
  Make sure the buffer can hold arg_length bytes plus a terminating zero
  byte, and reset the string to empty.

  The current contents are not preserved: when the present buffer is too
  small it is released with free() and a new one is obtained with
  my_malloc().

  RETURN
    FALSE  ok, the string is empty and Ptr[0] is 0
    TRUE   out of memory, or arg_length is too big to be aligned
*/

bool String::real_alloc(uint32 arg_length)
{
  uint32 new_length= ALIGN_SIZE(arg_length+1);
  /*
    arg_length+1 and the rounding done by ALIGN_SIZE() are computed in
    uint32 and can wrap around. A wrapped value would make us allocate
    a buffer that is smaller than what the caller asked for.
  */
  if (new_length <= arg_length)
    return TRUE;                                // Overflow
  str_length=0;
  if (Alloced_length < new_length)
  {
    free();
    if (!(Ptr=(char*) my_malloc(new_length,MYF(MY_WME))))
      return TRUE;
    Alloced_length=new_length;
    alloced=1;
  }
  Ptr[0]=0;
  return FALSE;
}


/*
** Check that string is big enough. Set string[alloc_length] to 0
** (for C functions)
*/

bool String::realloc(uint32 alloc_length)
{
  /*
    Grow the buffer (keeping the current contents) so that it can hold
    alloc_length bytes plus a terminating zero byte.
    Returns FALSE on success, TRUE on out of memory or length overflow.
  */
  uint32 len=ALIGN_SIZE(alloc_length+1);
  /*
    alloc_length+1 and the rounding done by ALIGN_SIZE() are computed in
    uint32 and can wrap around. Without this check a wrapped 'len' would
    skip the allocation below and Ptr[alloc_length] would be written
    outside of the buffer.
  */
  if (len <= alloc_length)
    return TRUE;                                // Overflow
  if (Alloced_length < len)
  {
    char *new_ptr;
    if (alloced)
    {
      /* We own the buffer: let my_realloc() grow it */
      if (!(new_ptr= (char*) my_realloc(Ptr,len,MYF(MY_WME))))
        return TRUE;				// Signal error
    }
    else if ((new_ptr= (char*) my_malloc(len,MYF(MY_WME))))
    {
      /* The old buffer is not ours: copy its contents into the new one */
      if (str_length > len - 1)
        str_length= 0;
      if (str_length)				// Avoid bugs in memcpy on AIX
        memcpy(new_ptr,Ptr,str_length);
      new_ptr[str_length]=0;
      alloced=1;
    }
    else
      return TRUE;			// Signal error
    Ptr= new_ptr;
    Alloced_length= len;
  }
  Ptr[alloc_length]=0;			// This make other funcs shorter
  return FALSE;
}

92
/*
  Set the string to the decimal text form of an integer.

  num            Value to convert
  unsigned_flag  Selects base 10 instead of base -10 for the charset's
                 longlong10_to_str() (presumably unsigned vs. signed output)
  cs             Character set of the result

  RETURN
    FALSE  ok
    TRUE   Could not allocate result buffer
*/

bool String::set_int(longlong num, bool unsigned_flag, CHARSET_INFO *cs)
{
  /* Room for 20 characters of up to cs->mbmaxlen bytes each, plus one byte */
  uint buffer_size= 20*cs->mbmaxlen+1;

  if (alloc(buffer_size))
    return TRUE;

  int radix;
  if (unsigned_flag)
    radix= 10;
  else
    radix= -10;

  str_length= (uint32) (cs->cset->longlong10_to_str)(cs, Ptr, buffer_size,
                                                     radix, num);
  str_charset= cs;
  return FALSE;
}

104
bool String::set_real(double num,uint decimals, CHARSET_INFO *cs)
unknown's avatar
unknown committed
105
{
106
  char buff[FLOATING_POINT_BUFFER];
107
  uint dummy_errors;
108
  size_t len;
109 110

  str_charset=cs;
unknown's avatar
unknown committed
111 112
  if (decimals >= NOT_FIXED_DEC)
  {
113
    len= my_gcvt(num, MY_GCVT_ARG_DOUBLE, sizeof(buff) - 1, buff, NULL);
114
    return copy(buff, len, &my_charset_latin1, cs, &dummy_errors);
unknown's avatar
unknown committed
115
  }
116 117
  len= my_fcvt(num, decimals, buff, NULL);
  return copy(buff, (uint32) len, &my_charset_latin1, cs,
118
              &dummy_errors);
unknown's avatar
unknown committed
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
}


/*
  Make sure the string data lives in a buffer marked as allocated
  by this object.

  RETURN
    FALSE  ok (also when the buffer was already allocated)
    TRUE   realloc() failed
*/

bool String::copy()
{
  if (alloced)
    return FALSE;                               // Nothing to do
  Alloced_length= 0;                            // Force realloc
  return realloc(str_length);
}

bool String::copy(const String &str)
{
  if (alloc(str.str_length))
    return TRUE;
  str_length=str.str_length;
  bmove(Ptr,str.Ptr,str_length);		// May be overlapping
  Ptr[str_length]=0;
139
  str_charset=str.str_charset;
unknown's avatar
unknown committed
140 141 142
  return FALSE;
}

143
bool String::copy(const char *str,uint32 arg_length, CHARSET_INFO *cs)
unknown's avatar
unknown committed
144 145 146
{
  if (alloc(arg_length))
    return TRUE;
147 148
  if ((str_length=arg_length))
    memcpy(Ptr,str,arg_length);
unknown's avatar
unknown committed
149
  Ptr[arg_length]=0;
150
  str_charset=cs;
unknown's avatar
unknown committed
151 152 153
  return FALSE;
}

154 155

/*
unknown's avatar
unknown committed
156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
  Checks that the source string can be just copied to the destination string
  without conversion.

  SYNOPSIS

  needs_conversion()
  arg_length		Length of string to copy.
  from_cs		Character set to copy from
  to_cs			Character set to copy to
  uint32 *offset	Returns number of unaligned characters.

  RETURN
   0  No conversion needed
   1  Either character set conversion or adding leading  zeros
      (e.g. for UCS-2) must be done
171 172 173 174

  NOTE
  to_cs may be NULL for "no conversion" if the system variable
  character_set_results is NULL.
175
*/
unknown's avatar
unknown committed
176 177 178 179 180

bool String::needs_conversion(uint32 arg_length,
                              CHARSET_INFO *from_cs,
                              CHARSET_INFO *to_cs,
                              uint32 *offset)
{
  *offset= 0;

  /* No target charset, binary target or identical charsets: plain copy */
  if (!to_cs || to_cs == &my_charset_bin || to_cs == from_cs)
    return FALSE;
  if (my_charset_same(from_cs, to_cs))
    return FALSE;

  /* Two different real character sets: conversion is required */
  if (from_cs != &my_charset_bin)
    return TRUE;

  /*
    Binary source: the bytes are kept as is, but the caller must pad them
    when the length is not a multiple of to_cs->mbminlen.
  */
  *offset= arg_length % to_cs->mbminlen;
  return *offset != 0;
}

unknown's avatar
unknown committed
193

unknown's avatar
unknown committed
194
/*
unknown's avatar
unknown committed
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
  Copy a multi-byte character sets with adding leading zeros.

  SYNOPSIS

  copy_aligned()
  str			String to copy
  arg_length		Length of string. This should NOT be divisible by
			cs->mbminlen.
  offset		arg_length % cs->mbminlen
  cs			Character set for 'str'

  NOTES
    For real multi-byte, ascii incompatible character sets,
    like UCS-2, add leading zeros if we have an incomplete character.
    Thus, 
      SELECT _ucs2 0xAA 
    will automatically be converted into
      SELECT _ucs2 0x00AA

  RETURN
    0  ok
    1  error
unknown's avatar
unknown committed
217 218
*/

unknown's avatar
unknown committed
219
bool String::copy_aligned(const char *str,uint32 arg_length, uint32 offset,
220
			  CHARSET_INFO *cs)
unknown's avatar
unknown committed
221 222
{
  /* How many bytes are in incomplete character */
unknown's avatar
unknown committed
223 224
  offset= cs->mbmaxlen - offset; /* How many zeros we should prepend */
  DBUG_ASSERT(offset && offset != cs->mbmaxlen);
unknown's avatar
unknown committed
225

unknown's avatar
unknown committed
226
  uint32 aligned_length= arg_length + offset;
unknown's avatar
unknown committed
227 228 229 230
  if (alloc(aligned_length))
    return TRUE;
  
  /*
unknown's avatar
unknown committed
231 232
    Note, this is only safe for big-endian UCS-2.
    If we add little-endian UCS-2 sometimes, this code
unknown's avatar
unknown committed
233
    will be more complicated. But it's OK for now.
unknown's avatar
unknown committed
234
  */
unknown's avatar
unknown committed
235 236
  bzero((char*) Ptr, offset);
  memcpy(Ptr + offset, str, arg_length);
unknown's avatar
unknown committed
237
  Ptr[aligned_length]=0;
unknown's avatar
unknown committed
238 239 240
  /* str_length is always >= 0 as arg_length is != 0 */
  str_length= aligned_length;
  str_charset= cs;
unknown's avatar
unknown committed
241 242 243
  return FALSE;
}

244 245 246 247 248

bool String::set_or_copy_aligned(const char *str,uint32 arg_length,
				 CHARSET_INFO *cs)
{
  /* How many bytes are in incomplete character */
unknown's avatar
unknown committed
249
  uint32 offset= (arg_length % cs->mbminlen); 
250
  
unknown's avatar
unknown committed
251
  if (!offset) /* All characters are complete, just copy */
252 253 254 255
  {
    set(str, arg_length, cs);
    return FALSE;
  }
unknown's avatar
unknown committed
256
  return copy_aligned(str, arg_length, offset, cs);
257 258
}

259 260 261 262 263 264 265 266 267 268

/**
   Copies the character data into this String, with optional character set
   conversion.

   @return
   FALSE ok
   TRUE  Could not allocate result buffer

*/
269

270
bool String::copy(const char *str, uint32 arg_length,
271
		  CHARSET_INFO *from_cs, CHARSET_INFO *to_cs, uint *errors)
272
{
unknown's avatar
unknown committed
273
  uint32 offset;
274 275

  DBUG_ASSERT(!str || str != Ptr);
276
  
unknown's avatar
unknown committed
277
  if (!needs_conversion(arg_length, from_cs, to_cs, &offset))
278
  {
279
    *errors= 0;
280
    return copy(str, arg_length, to_cs);
281
  }
unknown's avatar
unknown committed
282
  if ((from_cs == &my_charset_bin) && offset)
283
  {
284
    *errors= 0;
unknown's avatar
unknown committed
285
    return copy_aligned(str, arg_length, offset, to_cs);
286
  }
287
  uint32 new_length= to_cs->mbmaxlen*arg_length;
288 289
  if (alloc(new_length))
    return TRUE;
290
  str_length=copy_and_convert((char*) Ptr, new_length, to_cs,
291
                              str, arg_length, from_cs, errors);
292 293 294
  str_charset=to_cs;
  return FALSE;
}
295

296 297 298

/*
  Set a string to the value of a latin1-string, keeping the original charset
299
  
300 301 302 303
  SYNOPSIS
    set_ascii()
    str			String of a simple charset (latin1)
    arg_length		Length of string
304

305 306 307 308 309 310 311 312 313 314 315
  IMPLEMENTATION
    If string object is of a simple character set, set it to point to the
    given string.
    If not, make a copy and convert it to the new character set.

  RETURN
    0	ok
    1	Could not allocate result buffer

*/

316
bool String::set_ascii(const char *str, uint32 arg_length)
317
{
unknown's avatar
unknown committed
318
  if (str_charset->mbminlen == 1)
319 320 321
  {
    set(str, arg_length, str_charset);
    return 0;
322
  }
323 324
  uint dummy_errors;
  return copy(str, arg_length, &my_charset_latin1, str_charset, &dummy_errors);
325 326
}

327

unknown's avatar
unknown committed
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345
/* This is used by mysql.cc */

/*
  Make the string exactly max_length bytes long: cut it when it is
  longer, append fill_char when it is shorter.

  RETURN
    FALSE  ok
    TRUE   realloc() failed
*/

bool String::fill(uint32 max_length,char fill_char)
{
  if (str_length > max_length)
  {
    /* Too long: truncate and terminate */
    str_length= max_length;
    Ptr[max_length]= 0;
    return FALSE;
  }

  if (realloc(max_length))
    return TRUE;
  bfill(Ptr+str_length, max_length-str_length, fill_char);
  str_length= max_length;
  return FALSE;
}

void String::strip_sp()
{
346
   while (str_length && my_isspace(str_charset,Ptr[str_length-1]))
unknown's avatar
unknown committed
347 348 349 350 351
    str_length--;
}

bool String::append(const String &s)
{
352 353 354 355 356 357 358
  if (s.length())
  {
    if (realloc(str_length+s.length()))
      return TRUE;
    memcpy(Ptr+str_length,s.ptr(),s.length());
    str_length+=s.length();
  }
unknown's avatar
unknown committed
359 360 361
  return FALSE;
}

362 363

/*
364
  Append an ASCII string to a string of the current character set
365 366
*/

unknown's avatar
unknown committed
367 368
bool String::append(const char *s,uint32 arg_length)
{
369 370 371 372 373 374 375
  if (!arg_length)
    return FALSE;

  /*
    For an ASCII incompatible string, e.g. UCS-2, we need to convert
  */
  if (str_charset->mbminlen > 1)
376 377
  {
    uint32 add_length=arg_length * str_charset->mbmaxlen;
378
    uint dummy_errors;
379 380 381
    if (realloc(str_length+ add_length))
      return TRUE;
    str_length+= copy_and_convert(Ptr+str_length, add_length, str_charset,
382 383
				  s, arg_length, &my_charset_latin1,
                                  &dummy_errors);
384 385
    return FALSE;
  }
386 387 388 389

  /*
    For an ASCII compatinble string we can just append.
  */
unknown's avatar
unknown committed
390 391 392 393 394 395 396
  if (realloc(str_length+arg_length))
    return TRUE;
  memcpy(Ptr+str_length,s,arg_length);
  str_length+=arg_length;
  return FALSE;
}

397

398 399 400 401 402 403
/*
  Append a 0-terminated ASCII string
*/

bool String::append(const char *s)
{
404
  return append(s, (uint) strlen(s));
405 406 407
}


408 409 410 411 412 413 414
/*
  Append a string in the given charset to the string
  with character set recoding
*/

bool String::append(const char *s,uint32 arg_length, CHARSET_INFO *cs)
{
Alexander Barkov's avatar
Alexander Barkov committed
415
  uint32 offset;
416
  
Alexander Barkov's avatar
Alexander Barkov committed
417
  if (needs_conversion(arg_length, cs, str_charset, &offset))
418
  {
Alexander Barkov's avatar
Alexander Barkov committed
419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
    uint32 add_length;
    if ((cs == &my_charset_bin) && offset)
    {
      DBUG_ASSERT(str_charset->mbminlen > offset);
      offset= str_charset->mbminlen - offset; // How many characters to pad
      add_length= arg_length + offset;
      if (realloc(str_length + add_length))
        return TRUE;
      bzero((char*) Ptr + str_length, offset);
      memcpy(Ptr + str_length + offset, s, arg_length);
      str_length+= add_length;
      return FALSE;
    }

    add_length= arg_length / cs->mbminlen * str_charset->mbmaxlen;
434
    uint dummy_errors;
435
    if (realloc(str_length + add_length)) 
436 437
      return TRUE;
    str_length+= copy_and_convert(Ptr+str_length, add_length, str_charset,
438
				  s, arg_length, cs, &dummy_errors);
439
  }
440 441
  else
  {
442 443
    if (realloc(str_length + arg_length)) 
      return TRUE;
444 445 446
    memcpy(Ptr + str_length, s, arg_length);
    str_length+= arg_length;
  }
447 448 449
  return FALSE;
}

450 451 452 453
/*
  Append arg_length bytes read from an IO_CACHE.

  RETURN
    FALSE  ok
    TRUE   realloc() failed or my_b_read() reported an error; in the
           latter case shrink(str_length) is called and the string length
           is left unchanged
*/

bool String::append(IO_CACHE* file, uint32 arg_length)
{
  if (realloc(str_length+arg_length))
    return TRUE;

  const int read_failed= my_b_read(file, (uchar*) Ptr + str_length, arg_length);
  if (!read_failed)
  {
    str_length+= arg_length;
    return FALSE;
  }
  shrink(str_length);
  return TRUE;
}
unknown's avatar
unknown committed
462

unknown's avatar
unknown committed
463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479
bool String::append_with_prefill(const char *s,uint32 arg_length,
		 uint32 full_length, char fill_char)
{
  int t_length= arg_length > full_length ? arg_length : full_length;

  if (realloc(str_length + t_length))
    return TRUE;
  t_length= full_length - arg_length;
  if (t_length > 0)
  {
    bfill(Ptr+str_length, t_length, fill_char);
    str_length=str_length + t_length;
  }
  append(s, arg_length);
  return FALSE;
}

unknown's avatar
unknown committed
480 481
/*
  Return what the charset's numchars() handler reports for the
  bytes [Ptr, Ptr + str_length).
*/

uint32 String::numchars()
{
  char *data_end= Ptr+str_length;
  return str_charset->cset->numchars(str_charset, Ptr, data_end);
}

int String::charpos(int i,uint32 offset)
{
487 488
  if (i <= 0)
    return i;
489
  return str_charset->cset->charpos(str_charset,Ptr+offset,Ptr+str_length,i);
unknown's avatar
unknown committed
490 491 492 493 494 495 496
}

int String::strstr(const String &s,uint32 offset)
{
  if (s.length()+offset <= str_length)
  {
    if (!s.length())
unknown's avatar
unknown committed
497
      return ((int) offset);	// Empty string is always found
unknown's avatar
unknown committed
498 499 500 501 502

    register const char *str = Ptr+offset;
    register const char *search=s.ptr();
    const char *end=Ptr+str_length-s.length()+1;
    const char *search_end=s.ptr()+s.length();
503
skip:
unknown's avatar
unknown committed
504 505 506 507 508 509 510
    while (str != end)
    {
      if (*str++ == *search)
      {
	register char *i,*j;
	i=(char*) str; j=(char*) search+1;
	while (j != search_end)
511
	  if (*i++ != *j++) goto skip;
unknown's avatar
unknown committed
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533
	return (int) (str-Ptr) -1;
      }
    }
  }
  return -1;
}

/*
** Search string from end. Offset is offset to the end of string
*/

int String::strrstr(const String &s,uint32 offset)
{
  if (s.length() <= offset && offset <= str_length)
  {
    if (!s.length())
      return offset;				// Empty string is always found
    register const char *str = Ptr+offset-1;
    register const char *search=s.ptr()+s.length()-1;

    const char *end=Ptr+s.length()-2;
    const char *search_end=s.ptr()-1;
534
skip:
unknown's avatar
unknown committed
535 536 537 538 539 540 541
    while (str != end)
    {
      if (*str-- == *search)
      {
	register char *i,*j;
	i=(char*) str; j=(char*) search-1;
	while (j != search_end)
542
	  if (*i-- != *j--) goto skip;
unknown's avatar
unknown committed
543 544 545 546 547 548 549 550
	return (int) (i-Ptr) +1;
      }
    }
  }
  return -1;
}

/*
551 552
  Replace substring with string
  If wrong parameter or not enough memory, do nothing
unknown's avatar
unknown committed
553 554 555 556
*/

bool String::replace(uint32 offset,uint32 arg_length,const String &to)
{
  /* Forward to the pointer/length variant */
  const char *to_data= to.ptr();
  uint32 to_data_length= to.length();
  return replace(offset, arg_length, to_data, to_data_length);
}

bool String::replace(uint32 offset,uint32 arg_length,
561
                     const char *to, uint32 to_length)
562
{
563
  long diff = (long) to_length-(long) arg_length;
unknown's avatar
unknown committed
564 565 566 567
  if (offset+arg_length <= str_length)
  {
    if (diff < 0)
    {
568 569 570
      if (to_length)
	memcpy(Ptr+offset,to,to_length);
      bmove(Ptr+offset+to_length,Ptr+offset+arg_length,
unknown's avatar
unknown committed
571 572 573 574 575 576 577 578
	    str_length-offset-arg_length);
    }
    else
    {
      if (diff)
      {
	if (realloc(str_length+(uint32) diff))
	  return TRUE;
579
	bmove_upp((uchar*) Ptr+str_length+diff, (uchar*) Ptr+str_length,
unknown's avatar
unknown committed
580 581
		  str_length-offset-arg_length);
      }
582 583
      if (to_length)
	memcpy(Ptr+offset,to,to_length);
unknown's avatar
unknown committed
584 585 586 587 588 589
    }
    str_length+=(uint32) diff;
  }
  return FALSE;
}

590

unknown's avatar
unknown committed
591 592 593 594 595 596 597 598 599 600 601
// added by Holyfoot for "geometry" needs
/*
  Make sure there is room for space_needed more bytes after the current
  contents. When the buffer is too small it is grown by the larger of
  space_needed and grow_by.

  RETURN
    FALSE  ok
    TRUE   realloc() failed
*/
int String::reserve(uint32 space_needed, uint32 grow_by)
{
  if (Alloced_length >= str_length + space_needed)
    return FALSE;                               // Already big enough
  return realloc(Alloced_length + max(space_needed, grow_by) - 1) ? TRUE
                                                                  : FALSE;
}

unknown's avatar
unknown committed
602
void String::qs_append(const char *str, uint32 len)
unknown's avatar
unknown committed
603 604 605 606 607 608 609 610
{
  memcpy(Ptr + str_length, str, len + 1);
  str_length += len;
}

void String::qs_append(double d)
{
  char *buff = Ptr + str_length;
611 612
  str_length+= my_gcvt(d, MY_GCVT_ARG_DOUBLE, FLOATING_POINT_BUFFER - 1, buff,
                       NULL);
unknown's avatar
unknown committed
613 614 615 616 617
}

void String::qs_append(double *d)
{
  double ld;
unknown's avatar
unknown committed
618
  float8get(ld, (char*) d);
unknown's avatar
unknown committed
619 620 621
  qs_append(ld);
}

622 623
void String::qs_append(int i)
{
unknown's avatar
unknown committed
624 625 626
  char *buff= Ptr + str_length;
  char *end= int10_to_str(i, buff, -10);
  str_length+= (int) (end-buff);
627 628 629 630
}

/*
  Append the int10_to_str() text form (radix 10) of a uint directly at
  the end of the buffer; no allocation or size check is done here.
*/
void String::qs_append(uint i)
{
  char *start= Ptr + str_length;
  char *finish= int10_to_str(i, start, 10);
  str_length+= (int) (finish-start);
}

unknown's avatar
unknown committed
636 637 638 639 640 641 642 643 644 645 646
/*
  Compare strings according to collation, without end space.

  SYNOPSIS
    sortcmp()
    s		First string
    t		Second string
    cs		Collation

  NOTE:
    Normally this is case sensitive comparison
unknown's avatar
unknown committed
647

unknown's avatar
unknown committed
648 649 650 651 652 653 654 655
  RETURN
  < 0	s < t
  0	s == t
  > 0	s > t
*/


int sortcmp(const String *s,const String *t, CHARSET_INFO *cs)
unknown's avatar
unknown committed
656
{
unknown's avatar
unknown committed
657
 return cs->coll->strnncollsp(cs,
unknown's avatar
unknown committed
658 659
                              (uchar *) s->ptr(),s->length(),
                              (uchar *) t->ptr(),t->length(), 0);
unknown's avatar
unknown committed
660 661 662 663 664 665 666 667 668 669 670 671
}


/*
  Compare strings byte by byte. End spaces are also compared.

  SYNOPSIS
    stringcmp()
    s		First string
    t		Second string

  NOTE:
unknown's avatar
unknown committed
672
    Strings are compared as a stream of uchars
unknown's avatar
unknown committed
673 674 675 676 677 678 679 680 681

  RETURN
  < 0	s < t
  0	s == t
  > 0	s > t
*/


int stringcmp(const String *s,const String *t)
unknown's avatar
unknown committed
682
{
unknown's avatar
unknown committed
683 684 685
  uint32 s_len=s->length(),t_len=t->length(),len=min(s_len,t_len);
  int cmp= memcmp(s->ptr(), t->ptr(), len);
  return (cmp) ? cmp : (int) (s_len - t_len);
unknown's avatar
unknown committed
686 687 688 689 690 691 692 693 694 695 696 697 698 699
}


/*
  Return a String that has a buffer of at least from_length bytes and
  holds the data of 'from'.

  'from' itself is returned when its buffer is already big enough, or
  after growing it in place when it owns its buffer, when 'to' is NULL
  or when 'to' is the same object. Otherwise 'to' is grown, the data is
  copied into it and 'to' is returned.

  NOTE
    'from' is also returned when an allocation fails.
*/

String *copy_if_not_alloced(String *to,String *from,uint32 from_length)
{
  if (from->Alloced_length >= from_length)
    return from;

  const bool grow_in_place= from->alloced || !to || from == to;
  if (grow_in_place)
  {
    (void) from->realloc(from_length);
    return from;
  }

  if (to->realloc(from_length))
    return from;				// Actually an error

  const uint32 copy_length= min(from->str_length,from_length);
  to->str_length= copy_length;
  if (copy_length)
    memcpy(to->Ptr, from->Ptr, copy_length);
  to->str_charset= from->str_charset;
  return to;
}


707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
/****************************************************************************
  Help functions
****************************************************************************/

/*
  copy a string from one character set to another
  
  SYNOPSIS
    copy_and_convert()
    to			Store result here
    to_cs		Character set of result string
    from		Copy from here
    from_length		Length of from string
    from_cs		From character set

  NOTES
    'to' must be at least from_length * to_cs->mbmaxlen bytes big

  RETURN
    length of bytes copied to 'to'
*/

unknown's avatar
unknown committed
729

730 731 732 733 734
static uint32
copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
                          const char *from, uint32 from_length,
                          CHARSET_INFO *from_cs,
                          uint *errors)
{
  /*
    Generic conversion: every source character is decoded with mb_wc()
    of from_cs and encoded again with wc_mb() of to_cs. Characters that
    cannot be decoded or encoded are replaced by '?' and counted in
    *errors.
  */
  my_charset_conv_mb_wc decode= from_cs->cset->mb_wc;
  my_charset_conv_wc_mb encode= to_cs->cset->wc_mb;
  const uchar *from_end= (const uchar*) from+from_length;
  uchar *to_end= (uchar*) to+to_length;
  char *to_start= to;
  my_wc_t wc;
  int cnvres;
  uint error_count= 0;

  for (;;)
  {
    cnvres= (*decode)(from_cs, &wc, (uchar*) from, from_end);
    if (cnvres > 0)
      from+= cnvres;
    else if (cnvres == MY_CS_ILSEQ)
    {
      /* Bad byte sequence: skip one byte */
      error_count++;
      from++;
      wc= '?';
    }
    else if (cnvres > MY_CS_TOOSMALL)
    {
      /*
        A correct multibyte sequence detected
        But it doesn't have Unicode mapping.
      */
      error_count++;
      from+= (-cnvres);
      wc= '?';
    }
    else
      break;  // Not enough characters

    cnvres= (*encode)(to_cs, wc, (uchar*) to, to_end);
    if (cnvres == MY_CS_ILUNI && wc != '?')
    {
      /* No such character in to_cs: try once more with '?' */
      error_count++;
      wc= '?';
      cnvres= (*encode)(to_cs, wc, (uchar*) to, to_end);
    }
    if (cnvres <= 0)
      break;
    to+= cnvres;
  }
  *errors= error_count;
  return (uint32) (to - to_start);
}
unknown's avatar
unknown committed
784

785

786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844
/*
  Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
*/
uint32
copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
                 const char *from, uint32 from_length, CHARSET_INFO *from_cs,
                 uint *errors)
{
  /*
    If any of the character sets is not ASCII compatible,
    immediately switch to slow mb_wc->wc_mb method.
  */
  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
    return copy_and_convert_extended(to, to_length, to_cs,
                                     from, from_length, from_cs, errors);

  const uint32 fast_length= min(to_length, from_length);
  uint32 remaining= fast_length;

#if defined(__i386__)
  /*
    Special loop for i386, it allows to refer to a
    non-aligned memory block as UINT32, which makes
    it possible to copy four bytes at once. This
    gives about 10% performance improvement comparing
    to byte-by-byte loop.
  */
  for ( ; remaining >= 4; remaining-= 4, from+= 4, to+= 4)
  {
    if ((*(uint32*)from) & 0x80808080)
      break;
    *((uint32*) to)= *((const uint32*) from);
  }
#endif

  /* Copy bytes one by one for as long as they are 7-bit */
  while (remaining && *((unsigned char*) from) <= 0x7F)
  {
    *to++= *from++;
    remaining--;
  }

  if (!remaining)
  {
    /* Everything that fits was plain ASCII */
    *errors= 0;
    return fast_length;
  }

  /* A non-ASCII character: the slow method handles the rest */
  const uint32 copied_length= fast_length - remaining;
  to_length-= copied_length;
  from_length-= copied_length;
  return copied_length + copy_and_convert_extended(to, to_length,
                                                   to_cs,
                                                   from, from_length,
                                                   from_cs,
                                                   errors);
}


845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906
/**
  Copy string with HEX-encoding of "bad" characters.

  @details This functions copies the string pointed by "src"
  to the string pointed by "dst". Not more than "srclen" bytes
  are read from "src". Any sequences of bytes representing
  a not-well-formed substring (according to cs) are hex-encoded,
  and all well-formed substrings (according to cs) are copied as is.
  Not more than "dstlen" bytes are written to "dst". The number 
  of bytes written to "dst" is returned.
  
   @param      cs       character set pointer of the destination string
   @param[out] dst      destination string
   @param      dstlen   size of dst
   @param      src      source string
   @param      srclen   length of src

   @retval     result length
*/

size_t
my_copy_with_hex_escaping(CHARSET_INFO *cs,
                          char *dst, size_t dstlen,
                          const char *src, size_t srclen)
{
  const char *src_end= src + srclen;
  char *dst_start= dst;

  while (src < src_end)
  {
    const size_t mb_length= my_ismbchar(cs, src, src_end);
    if (mb_length)
    {
      /* Multi-byte character: copy it as is */
      if (dstlen < mb_length)
        break; /* purecov: inspected */
      memcpy(dst, src, mb_length);
      src+= mb_length;
      dst+= mb_length;
      dstlen-= mb_length;
      continue;
    }

    if (*src & 0x80)
    {
      /* High-bit byte outside of a multi-byte character: write \xHH */
      if (dstlen < 4)
        break; /* purecov: inspected */
      const unsigned char bad_byte= (unsigned char) *src;
      *dst++= '\\';
      *dst++= 'x';
      *dst++= _dig_vec_upper[bad_byte >> 4];
      *dst++= _dig_vec_upper[bad_byte & 15];
      src++;
      dstlen-= 4;
      continue;
    }

    /* 7-bit byte: copy it as is */
    if (dstlen < 1)
      break; /* purecov: inspected */
    *dst++= *src++;
    dstlen--;
  }
  return dst - dst_start;
}

907 908 909 910 911 912
/*
  copy a string,
  with optional character set conversion,
  with optional left padding (for binary -> UCS2 conversion)
  
  SYNOPSIS
913
    well_formed_copy_nchars()
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982
    to			     Store result here
    to_length                Maximum length of "to" string
    to_cs		     Character set of "to" string
    from		     Copy from here
    from_length		     Length of from string
    from_cs		     From character set
    nchars                   Copy not more that nchars characters
    well_formed_error_pos    Return position when "from" is not well formed
                             or NULL otherwise.
    cannot_convert_error_pos Return position where a not convertible
                             character met, or NULL otherwise.
    from_end_pos             Return position where scanning of "from"
                             string stopped.
  NOTES

  RETURN
    length of bytes copied to 'to'
*/


uint32
well_formed_copy_nchars(CHARSET_INFO *to_cs,
                        char *to, uint to_length,
                        CHARSET_INFO *from_cs,
                        const char *from, uint from_length,
                        uint nchars,
                        const char **well_formed_error_pos,
                        const char **cannot_convert_error_pos,
                        const char **from_end_pos)
{
  uint res;

  if ((to_cs == &my_charset_bin) || 
      (from_cs == &my_charset_bin) ||
      (to_cs == from_cs) ||
      my_charset_same(from_cs, to_cs))
  {
    /* No character set conversion is needed: bytes are copied as is */
    if (to_length < to_cs->mbminlen || !nchars)
    {
      /* Not even one character can be stored */
      *from_end_pos= from;
      *cannot_convert_error_pos= NULL;
      *well_formed_error_pos= NULL;
      return 0;
    }

    if (to_cs == &my_charset_bin)
    {
      /* Binary destination: no well-formedness check */
      res= min(min(nchars, to_length), from_length);
      memmove(to, from, res);
      *from_end_pos= from + res;
      *well_formed_error_pos= NULL;
      *cannot_convert_error_pos= NULL;
    }
    else
    {
      int well_formed_error;
      uint from_offset= from_length % to_cs->mbminlen;
      /* TRUE only if a zero-padded leading character was written to "to" */
      bool padded= FALSE;

      if (from_offset && (from_cs == &my_charset_bin))
      {
        /*
          Copying from BINARY to UCS2 needs to prepend zeros sometimes:
          INSERT INTO t1 (ucs2_column) VALUES (0x01);
          0x01 -> 0x0001
        */
        uint pad_length= to_cs->mbminlen - from_offset;
        bzero(to, pad_length);
        memmove(to + pad_length, from, from_offset);
        /*
          In some cases left zero-padding can create an incorrect character.
          For example:
            INSERT INTO t1 (utf32_column) VALUES (0x110000);
          We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
          The valid characters range is limited to 0x00000000..0x0010FFFF.
          
          Make sure we didn't pad to an incorrect character.
        */
        if (to_cs->cset->well_formed_len(to_cs,
                                         to, to + to_cs->mbminlen, 1,
                                         &well_formed_error) !=
                                         to_cs->mbminlen)
        {
          *from_end_pos= *well_formed_error_pos= from;
          *cannot_convert_error_pos= NULL;
          return 0;
        }
        /* The padded character is stored; continue after it */
        padded= TRUE;
        nchars--;
        from+= from_offset;
        from_length-= from_offset;
        to+= to_cs->mbminlen;
        to_length-= to_cs->mbminlen;
      }

      set_if_smaller(from_length, to_length);
      res= to_cs->cset->well_formed_len(to_cs, from, from + from_length,
                                        nchars, &well_formed_error);
      memmove(to, from, res);
      *from_end_pos= from + res;
      *well_formed_error_pos= well_formed_error ? from + res : NULL;
      *cannot_convert_error_pos= NULL;
      /*
        Count the padded character only if it was really written.
        from_offset can also be non-zero for a non-binary source, in which
        case nothing was prepended and adding mbminlen would report bytes
        that were never stored in "to".
      */
      if (padded)
        res+= to_cs->mbminlen;
    }
  }
  else
  {
    /* Convert character by character through mb_wc() and wc_mb() */
    int cnvres;
    my_wc_t wc;
    my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
    my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
    const uchar *from_end= (const uchar*) from + from_length;
    uchar *to_end= (uchar*) to + to_length;
    char *to_start= to;
    *well_formed_error_pos= NULL;
    *cannot_convert_error_pos= NULL;

    for ( ; nchars; nchars--)
    {
      const char *from_prev= from;
      if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
        from+= cnvres;
      else if (cnvres == MY_CS_ILSEQ)
      {
        /* Only the first error position is remembered */
        if (!*well_formed_error_pos)
          *well_formed_error_pos= from;
        from++;
        wc= '?';
      }
      else if (cnvres > MY_CS_TOOSMALL)
      {
        /*
          A correct multibyte sequence detected
          But it doesn't have Unicode mapping.
        */
        if (!*cannot_convert_error_pos)
          *cannot_convert_error_pos= from;
        from+= (-cnvres);
        wc= '?';
      }
      else
        break;  // Not enough characters

outp:
      if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
        to+= cnvres;
      else if (cnvres == MY_CS_ILUNI && wc != '?')
      {
        /* No such character in to_cs: store '?' instead */
        if (!*cannot_convert_error_pos)
          *cannot_convert_error_pos= from_prev;
        wc= '?';
        goto outp;
      }
      else
      {
        /* Character was not stored: report it as not consumed */
        from= from_prev;
        break;
      }
    }
    *from_end_pos= from;
    res= (uint) (to - to_start);
  }
  return (uint32) res;
}




unknown's avatar
unknown committed
1082 1083 1084
/*
  Append an escaped copy of this string to another string.

  SYNOPSIS
    String::print()
    str   String that receives the escaped bytes

  DESCRIPTION
    Walks the str_length bytes starting at Ptr. A backslash, a zero byte,
    a single quote, a newline, a carriage return and Ctrl-Z (octal 032)
    are each appended as a two-character backslash sequence; every other
    byte is appended unchanged.
*/

void String::print(String *str)
{
  const char *pos= Ptr;
  const char *const stop= pos + str_length;

  while (pos < stop)
  {
    uchar c= *pos++;

    if (c == '\\')
      str->append(STRING_WITH_LEN("\\\\"));
    else if (c == '\0')
      str->append(STRING_WITH_LEN("\\0"));
    else if (c == '\'')
      str->append(STRING_WITH_LEN("\\'"));
    else if (c == '\n')
      str->append(STRING_WITH_LEN("\\n"));
    else if (c == '\r')
      str->append(STRING_WITH_LEN("\\r"));
    else if (c == '\032')                       // Ctrl-Z
      str->append(STRING_WITH_LEN("\\Z"));
    else
      str->append(c);                           // Anything else goes out as is
  }
}
1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132


/*
  Swap the complete state of two String objects.

  SYNOPSIS
    String::swap()
    s   String to exchange state with

  DESCRIPTION
    Exchanges the buffer pointer, the string length, the allocated length,
    the "alloced" flag and the character set between this object and s.
    Only the member values are exchanged; the buffers themselves are not
    copied.

  RETURN
    Nothing. Afterwards s holds the former state of this object and this
    object holds the former state of s.
*/

void String::swap(String &s)
{
  char *saved_ptr= Ptr;
  Ptr= s.Ptr;
  s.Ptr= saved_ptr;

  uint32 saved_length= str_length;
  str_length= s.str_length;
  s.str_length= saved_length;

  uint32 saved_alloced_length= Alloced_length;
  Alloced_length= s.Alloced_length;
  s.Alloced_length= saved_alloced_length;

  bool saved_alloced= alloced;
  alloced= s.alloced;
  s.alloced= saved_alloced;

  CHARSET_INFO *saved_charset= str_charset;
  str_charset= s.str_charset;
  s.str_charset= saved_charset;
}
1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205


/**
  Convert string to printable ASCII string

  @details This function converts the input string "from", replacing every
  byte outside the range 0x20..0x7F with a hexadecimal sequence ("\xXX").
  If the source charset is not ASCII compatible (mbminlen != 1, e.g. UCS2),
  every byte is written in hexadecimal notation.
  When not all of the input could be shown, either because "nbytes" limits
  it or because the output buffer is too small, the result ends with "...".
  The result is always zero-terminated.
  This function is used in the ER_TRUNCATED_WRONG_VALUE_FOR_FIELD error
  messages, e.g. when a string cannot be converted to a result charset.


  @param    to          output buffer
  @param    to_len      size of the output buffer (8 bytes or greater)
  @param    from        input string
  @param    from_len    size of the input string
  @param    from_cs     input charset
  @param    nbytes      maximal number of bytes to convert (from_len if 0)

  @return   number of bytes in the output string, not counting the
            terminating zero byte and including a trailing "..." if present
*/

uint convert_to_printable(char *to, size_t to_len,
                          const char *from, size_t from_len,
                          CHARSET_INFO *from_cs, size_t nbytes /*= 0*/)
{
  /* needs at least 8 bytes for '\xXX...' and zero byte */
  DBUG_ASSERT(to_len >= 8);

  char *t= to;
  char *t_end= to + to_len - 1; // '- 1' is for the '\0' at the end
  char *dots= to; // last safe place to append '...'

  if (!from || t == t_end)
  {
    /* Nothing can be converted: still hand back a valid empty string. */
    if (to_len)
      *to= '\0';
    return 0;
  }

  /* Computed only after the NULL check to avoid arithmetic on a NULL pointer */
  const char *f= from;
  const char *f_end= from + (nbytes ? min(from_len, nbytes) : from_len);

  for (; t < t_end && f < f_end; f++)
  {
    /*
      If the source string is ASCII compatible (mbminlen==1)
      and the source character is in ASCII printable range (0x20..0x7F),
      then display the character as is.

      Otherwise, if the source string is not ASCII compatible (e.g. UCS2),
      or the source character is not in the printable range,
      then print the character using HEX notation.
    */
    if (((unsigned char) *f) >= 0x20 &&
        ((unsigned char) *f) <= 0x7F &&
        from_cs->mbminlen == 1)
    {
      *t++= *f;
    }
    else
    {
      if (t_end - t < 4) // \xXX
        break;
      *t++= '\\';
      *t++= 'x';
      *t++= _dig_vec_upper[((unsigned char) *f) >> 4];
      *t++= _dig_vec_upper[((unsigned char) *f) & 0x0F];
    }
    /* Remember the furthest position that still leaves room for '...' */
    if (t_end - t >= 3) // '...'
      dots= t;
  }
  if (f < from + from_len)
  {
    /*
      Part of the input is not shown: write "..." and the terminating zero
      at the last safe position. The string now ends right after the dots,
      which may be before or after the current write position, so the
      returned length must be derived from "dots" rather than from "t".
    */
    memcpy(dots, STRING_WITH_LEN("...\0"));
    t= dots + 3;
  }
  else
    *t= '\0';
  return (uint) (t - to);
}