유니코드 To UTF-8

2024. 5. 2. 09:02
#include <iostream>
#include <string>

using namespace std;


// Converts the UTF-8 byte sequence held in utf8_code to a Unicode code point.
int utf8_to_unicode(string utf8_code);
// Returns the UTF-8 byte sequence that encodes the code point `unicode`.
string unicode_to_utf8(int unicode);
// Builds a text table of the characters with code points min..max (inclusive),
// laid out elements_in_row per line with `separator` between neighbours.
string unicode_table(int min, int max, string separator, int elements_in_row);


// Demo driver.
// Prints a table of the characters U+0020..U+2800 (32 per row), then four
// sample characters ($, cent, euro, smiley) converted code point -> UTF-8
// (decimal and hexadecimal input) and UTF-8 -> code point (literal and
// escaped input). Each group is printed on one tab-separated line.
int main()
{
 const int sample_count = 4;

 const int decimal_points[sample_count] = {36, 162, 8364, 128578};
 const int hex_points[sample_count] = {0x24, 0xa2, 0x20ac, 0x1f642};
 const char* const literal_chars[sample_count] = {"$", "¢", "€", "🙂"};
 const char* const escaped_chars[sample_count] =
  {"\x24", "\xc2\xa2", "\xe2\x82\xac", "\xf0\x9f\x99\x82"};

 cout << unicode_table(32, 10240, "", 32) << endl;

 // Code point -> UTF-8, code points written in decimal.
 for (int i = 0; i < sample_count; ++i)
 {
  cout << unicode_to_utf8(decimal_points[i]);
  if (i + 1 < sample_count)
   cout << '\t';
 }
 cout << endl;

 // Code point -> UTF-8, the same code points written in hexadecimal.
 for (int i = 0; i < sample_count; ++i)
 {
  cout << unicode_to_utf8(hex_points[i]);
  if (i + 1 < sample_count)
   cout << '\t';
 }
 cout << endl;

 // UTF-8 -> code point, characters typed directly in the source.
 for (int i = 0; i < sample_count; ++i)
 {
  cout << utf8_to_unicode(literal_chars[i]);
  if (i + 1 < sample_count)
   cout << '\t';
 }
 cout << endl;

 // UTF-8 -> code point, the same characters given as byte escapes.
 for (int i = 0; i < sample_count; ++i)
 {
  cout << utf8_to_unicode(escaped_chars[i]);
  if (i + 1 < sample_count)
   cout << '\t';
 }
 cout << endl;

 return 0;
}


// Decodes a single UTF-8 encoded character into its Unicode code point.
//
// utf8_code must contain exactly one well-formed UTF-8 sequence (1-4 bytes).
// Returns the code point, or -1 when the input is not such a sequence:
// empty string, length that does not match the lead byte, bad continuation
// byte, overlong encoding, surrogate (U+D800..U+DFFF) or value > U+10FFFF.
int utf8_to_unicode(string utf8_code)
{
 const int invalid = -1;
 const string::size_type utf8_size = utf8_code.length();

 if (utf8_size == 0 || utf8_size > 4)
  return invalid;

 // The lead byte, not the string length, defines how long the sequence is
 // and how many payload bits the first byte carries.
 const unsigned char lead = static_cast<unsigned char>(utf8_code[0]);
 string::size_type expected_size = 0;
 int unicode = 0;

 if (lead < 0x80)                // 0xxxxxxx
 {
  expected_size = 1;
  unicode = lead;
 }
 else if ((lead & 0xe0) == 0xc0) // 110xxxxx
 {
  expected_size = 2;
  unicode = lead & 0x1f;
 }
 else if ((lead & 0xf0) == 0xe0) // 1110xxxx
 {
  expected_size = 3;
  unicode = lead & 0x0f;
 }
 else if ((lead & 0xf8) == 0xf0) // 11110xxx
 {
  expected_size = 4;
  unicode = lead & 0x07;
 }
 else
  return invalid;                // stray continuation byte or 0xf8..0xff

 if (utf8_size != expected_size)
  return invalid;                // truncated sequence or trailing bytes

 for (string::size_type p = 1; p < utf8_size; ++p)
 {
  const unsigned char c = static_cast<unsigned char>(utf8_code[p]);

  if ((c & 0xc0) != 0x80)        // every following byte must be 10xxxxxx
   return invalid;

  unicode = (unicode << 6) | (c & 0x3f);
 }

 // Smallest code point that legitimately needs a sequence of each length;
 // anything below it is an overlong encoding.
 const int smallest_for_size[5] = {0, 0, 0x80, 0x800, 0x10000};

 if (unicode < smallest_for_size[utf8_size])
  return invalid;

 if (unicode > 0x10ffff || (unicode >= 0xd800 && unicode <= 0xdfff))
  return invalid;

 return unicode;
}


// Encodes a Unicode code point as a UTF-8 byte sequence.
//
// Returns a string of 1 to 4 bytes for a valid Unicode scalar value
// (U+0000..U+10FFFF, excluding the surrogates U+D800..U+DFFF).
// Returns an empty string for anything else: negative values, surrogates
// and values above U+10FFFF (5- and 6-byte forms are not part of UTF-8).
string unicode_to_utf8(int unicode)
{
 // Reject invalid input first so that negative values cannot fall into the
 // "<= 0x7ff" branch below and come out as a bogus 2-byte sequence.
 if (unicode < 0 || unicode > 0x10ffff)
  return "";

 if (unicode >= 0xd800 && unicode <= 0xdfff)
  return "";

 string s;

 if (unicode <= 0x7f)        // 7 bits:  0xxxxxxx
 {
  s += static_cast<char>(unicode);
 }
 else if (unicode <= 0x7ff)  // 11 bits: 110xxxxx 10xxxxxx
 {
  s += static_cast<char>(0xc0 | (unicode >> 6));
  s += static_cast<char>(0x80 | (unicode & 0x3f));
 }
 else if (unicode <= 0xffff) // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
 {
  s += static_cast<char>(0xe0 | (unicode >> 12));
  s += static_cast<char>(0x80 | ((unicode >> 6) & 0x3f));
  s += static_cast<char>(0x80 | (unicode & 0x3f));
 }
 else                        // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 {
  s += static_cast<char>(0xf0 | (unicode >> 18));
  s += static_cast<char>(0x80 | ((unicode >> 12) & 0x3f));
  s += static_cast<char>(0x80 | ((unicode >> 6) & 0x3f));
  s += static_cast<char>(0x80 | (unicode & 0x3f));
 }

 return s;
}


// Builds a text table of the characters with code points min..max inclusive.
//
// Code point k is placed in column (k mod elements_in_row), so a row break
// follows every code point whose column is the last one. The first row is
// left-padded with one space per skipped column so that min lands in its
// own column. `separator` is inserted between neighbours within a row.
// The result always ends with exactly one '\n' unless the range is empty
// (min > max), in which case an empty string is returned.
// A non-positive elements_in_row is treated as 1.
// The whole table, padding included, is returned; nothing is printed here.
string unicode_table(int min, int max, string separator, int elements_in_row)
{
 if (min > max)
  return "";

 // Guards the modulo operations below against division by zero.
 const long long row_length = (elements_in_row < 1? 1: elements_in_row);

 // Column of the first code point, normalised so it is never negative.
 const long long first_column = ((min % row_length) + row_length) % row_length;

 string s(static_cast<string::size_type>(first_column), ' ');

 // long long counter: ++k cannot overflow even when max is the largest int.
 for (long long k=min; k<=max; ++k)
 {
  s += unicode_to_utf8(static_cast<int>(k));

  const long long column = ((k % row_length) + row_length) % row_length;

  if (column == row_length - 1)
   s += '\n';
  else if (k < max)
   s += separator;   // only between neighbours, never at the end of a row
 }

 // Close the last row if the loop above did not already do so.
 const long long last_column = ((max % row_length) + row_length) % row_length;

 if (last_column != row_length - 1)
  s += '\n';

 return s;
}
출처 - https://sites.google.com/view/technik-informatyk-nysa/porady/porady-c-cpp#h.p_u68nLns9HPff
천상

유니코드 To UTF-8

티스토리툴바