Bootstrap

C++对中文字符的处理

前言

1.C++的string对中文的查找、替换等基本操作并不友好。如果要对中文进行操作,需要把中文转成宽字符串(wstring)来解决:string按字节存储,其中字符的长度并不固定,在GBK等多字节编码中每个中文占2个字节,而字符串中有时还混有英文、数字等字符,这些只占1个字节,因此按字节查找时,返回的位置往往不是该字符在字符串中的实际位置。
2.如果要操作中文字符串,比较好的办法是先把string转成wstring,完成查找、匹配等操作之后,再转回string。
3.这里我定义了一个类,把它们之间的互相转换都封装成了成员函数。

代码

Chinese.h

#pragma once
#include <string>
#include <iostream>

// Helper class grouping conversions between multi-byte (char) strings and
// wide (wchar_t) strings. It declares no data members.
class Chinese
{
public:
	Chinese();
	~Chinese();

	// Converts the char* string `str` to wchar_t*, writing the result into
	// the caller-supplied `buff`; returns `buff`.
	wchar_t* MBCSToUnicode(wchar_t * buff, const char * str);
	// Converts the wchar_t* string `str` to char*, writing the result into
	// the caller-supplied `buff`; returns `buff`.
	char* unicodeToMBCS(char* buff, const wchar_t* str);
	// Converts a std::string to a std::wstring.
	std::wstring strToWstr(std::string &input);
	// Converts a std::wstring to a std::string.
	std::string wstrToStr(std::wstring &wstr);
	
	// Convert a wide string (std::wstring or NUL-terminated wchar_t*) to char*.
	char* wstrToChar(std::wstring &wstr);
	char* wstrToChar(const wchar_t* wstr);
};

Chinese.cpp

#include "Chinese.h"

// Constructor: the body is empty, there is nothing to initialise.
Chinese::Chinese() {}

// Destructor: the body is empty, there is nothing to release.
Chinese::~Chinese() {}

// Packs a NUL-terminated multi-byte (MBCS) string into wide characters.
//
// A byte with its high bit set is treated as the lead byte of a two-byte
// character: it and the following byte are combined into one wchar_t, with
// the lead byte in the low 8 bits and the trail byte in bits 8-15 (the value
// a little-endian 16-bit read of the two bytes yields). Every other byte is
// widened unchanged. No code-page lookup is performed, so the result is a
// packed copy of the input bytes, not real Unicode code points.
//
// buff: destination buffer; needs room for strlen(str) + 1 wide characters.
// str:  NUL-terminated source string.
// Returns buff, which is always NUL-terminated on return.
wchar_t* Chinese::MBCSToUnicode(wchar_t* buff, const char* str)
{
	// Work on unsigned bytes so values >= 0x80 are never sign-extended.
	const unsigned char* src = reinterpret_cast<const unsigned char*>(str);
	wchar_t* dst = buff;
	while (*src != 0)
	{
		const unsigned int lead = *src++;
		if ((lead & 0x80u) != 0 && *src != 0)
		{
			// Combine the two bytes explicitly rather than reading through a
			// wchar_t* cast: such a read is unaligned and pulls in
			// sizeof(wchar_t) bytes, which is wrong when wchar_t is wider
			// than two bytes.
			const unsigned int trail = *src++;
			*dst++ = static_cast<wchar_t>(lead | (trail << 8));
		}
		else
		{
			// Either a plain single-byte character, or a lead byte cut off by
			// the end of the string; in the latter case we must not step over
			// the terminator.
			*dst++ = static_cast<wchar_t>(lead);
		}
	}
	*dst = L'\0';
	return buff;
}

// Unpacks wide characters into a NUL-terminated byte string.
//
// A wide character with any of bits 8-15 set is written as two bytes: its low
// 8 bits first, then bits 8-15. Every other wide character is written as its
// low byte only. Bits above 15 are ignored.
//
// buff: destination buffer; needs room for 2 * wcslen(str) + 1 bytes.
// str:  NUL-terminated wide source string.
// Returns buff, which is always NUL-terminated on return.
char* Chinese::unicodeToMBCS(char* buff, const wchar_t* str)
{
	char* out = buff;
	for (const wchar_t* in = str; *in != 0; ++in)
	{
		// Extract the bytes with shifts instead of reinterpreting the
		// wchar_t's storage as char, so the output does not depend on the
		// host's byte order.
		const unsigned long value = static_cast<unsigned long>(*in);
		*out++ = static_cast<char>(value & 0xFFu);
		if ((value & 0xFF00u) != 0)
		{
			*out++ = static_cast<char>((value >> 8) & 0xFFu);
		}
	}
	*out = '\0';
	return buff;
}

// Converts a multi-byte std::string to a std::wstring via MBCSToUnicode().
//
// input: source string. It is handed over as a C string, so only the part
//        before the first NUL character is converted.
// Returns the converted wide string.
std::wstring Chinese::strToWstr(std::string &input)
{
	// One wide character per input byte is the upper bound, plus one slot for
	// the terminator. A std::wstring owns the scratch space, so there is no
	// malloc result to check and nothing to leak if building the result throws.
	std::wstring scratch(input.size() + 1, L'\0');
	MBCSToUnicode(&scratch[0], input.c_str());
	// Rebuild from the C string to drop the unused tail of the scratch buffer.
	return std::wstring(scratch.c_str());
}

// std::wstring overload: forwards the string's C-string view to the
// const wchar_t* overload and returns that overload's result unchanged.
char* Chinese::wstrToChar(std::wstring &wstr)
{
	return wstrToChar(wstr.c_str());
}

// Converts a NUL-terminated wide string to a freshly allocated char string
// using unicodeToMBCS().
//
// wstr: NUL-terminated wide source string.
// Returns a malloc()-allocated, NUL-terminated buffer that the caller owns
// and must release with free(); returns a null pointer if allocation fails.
char* Chinese::wstrToChar(const wchar_t* wstr)
{
	// size_t avoids narrowing the length returned by wcslen.
	const size_t len = wcslen(wstr);
	// Worst case is two bytes per wide character, plus the terminator.
	char* buff = static_cast<char*>(malloc(len * 2 + 1));
	if (!buff)
	{
		// Allocation failed: hand the null pointer back to the caller.
		return buff;
	}
	// Ownership passes to the caller. Freeing the buffer here, as the code
	// used to, returned a dangling pointer.
	return unicodeToMBCS(buff, wstr);
}

// Converts a std::wstring to a multi-byte std::string via unicodeToMBCS().
//
// wstr: source wide string. It is handed over as a C string, so only the
//       part before the first NUL character is converted.
// Returns the converted byte string.
std::string Chinese::wstrToStr(std::wstring &wstr)
{
	// Two bytes per wide character is the upper bound, plus one for the
	// terminator. A std::string owns the scratch space, so there is no malloc
	// result to check and nothing to leak if building the result throws.
	std::string scratch(2 * wstr.size() + 1, '\0');
	unicodeToMBCS(&scratch[0], wstr.c_str());
	// Rebuild from the C string to drop the unused tail of the scratch buffer.
	return std::string(scratch.c_str());
}

main.cpp

#include <iostream>
#include <string>
#include "Chinese.h"

int main()
{
	//输入层:接收char*输入,并将其转换为wchar*
	std::string input = "于老师的k父亲王老爷子是蒙古的海军司令!yes";
	std::string temp = "王";
	
	Chinese ch;
	std::wstring w_str = ch.strToWstr(input);
	std::wstring w_tem = ch.strToWstr(temp);
	int index = w_str.find(w_tem);
	std::cout << index << std::endl;
	
	return 0;
}
;