1fbb725bbSAdrien Destugues/*
2fbb725bbSAdrien Destugues * Copyright 2016, Haiku, inc.
3fbb725bbSAdrien Destugues * Distributed under terms of the MIT license.
4fbb725bbSAdrien Destugues */
5fbb725bbSAdrien Destugues
6fbb725bbSAdrien Destugues
7fbb725bbSAdrien Destugues#include "TextEncoding.h"
8fbb725bbSAdrien Destugues
908e52491SAdrien Destugues#include <unicode/ucnv.h>
10fbb725bbSAdrien Destugues#include <unicode/ucsdet.h>
11fbb725bbSAdrien Destugues
1208e52491SAdrien Destugues#include <algorithm>
1308e52491SAdrien Destugues
1408e52491SAdrien Destugues
15a71b10d7SAdrien Destuguesnamespace BPrivate {
16a71b10d7SAdrien Destugues
17a71b10d7SAdrien Destugues
18a71b10d7SAdrien DestuguesBTextEncoding::BTextEncoding(BString name)
1908e52491SAdrien Destugues	:
2008e52491SAdrien Destugues	fName(name),
2108e52491SAdrien Destugues	fUtf8Converter(NULL),
2208e52491SAdrien Destugues	fConverter(NULL)
2308e52491SAdrien Destugues{
2408e52491SAdrien Destugues}
2508e52491SAdrien Destugues
26fbb725bbSAdrien Destugues
27a71b10d7SAdrien DestuguesBTextEncoding::BTextEncoding(const char* data, size_t length)
2808e52491SAdrien Destugues	:
2908e52491SAdrien Destugues	fUtf8Converter(NULL),
3008e52491SAdrien Destugues	fConverter(NULL)
31fbb725bbSAdrien Destugues{
32fbb725bbSAdrien Destugues	UErrorCode error = U_ZERO_ERROR;
33fbb725bbSAdrien Destugues
34fbb725bbSAdrien Destugues	UCharsetDetector* detector = ucsdet_open(&error);
35fbb725bbSAdrien Destugues	ucsdet_setText(detector, data, length, &error);
36fbb725bbSAdrien Destugues	const UCharsetMatch* encoding = ucsdet_detect(detector, &error);
37fbb725bbSAdrien Destugues
38fbb725bbSAdrien Destugues	fName = ucsdet_getName(encoding, &error);
39fbb725bbSAdrien Destugues	ucsdet_close(detector);
40fbb725bbSAdrien Destugues}
41fbb725bbSAdrien Destugues
42fbb725bbSAdrien Destugues
43a71b10d7SAdrien DestuguesBTextEncoding::~BTextEncoding()
4408e52491SAdrien Destugues{
4508e52491SAdrien Destugues	if (fUtf8Converter != NULL)
4608e52491SAdrien Destugues		ucnv_close(fUtf8Converter);
4708e52491SAdrien Destugues
4808e52491SAdrien Destugues	if (fConverter != NULL)
4908e52491SAdrien Destugues		ucnv_close(fConverter);
5008e52491SAdrien Destugues}
5108e52491SAdrien Destugues
5208e52491SAdrien Destugues
5308e52491SAdrien Destuguesstatus_t
54a71b10d7SAdrien DestuguesBTextEncoding::InitCheck()
5508e52491SAdrien Destugues{
5608e52491SAdrien Destugues	if (fName.IsEmpty())
5708e52491SAdrien Destugues		return B_NO_INIT;
5808e52491SAdrien Destugues	else
5908e52491SAdrien Destugues		return B_OK;
6008e52491SAdrien Destugues}
6108e52491SAdrien Destugues
6208e52491SAdrien Destugues
6308e52491SAdrien Destuguesstatus_t
64a71b10d7SAdrien DestuguesBTextEncoding::Decode(const char* input, size_t& inputLength, char* output,
6508e52491SAdrien Destugues	size_t& outputLength)
6608e52491SAdrien Destugues{
6708e52491SAdrien Destugues	const char* base = input;
6808e52491SAdrien Destugues	char* target = output;
6908e52491SAdrien Destugues
7008e52491SAdrien Destugues	// Optimize the easy case.
7108e52491SAdrien Destugues	// Note: we don't check the input to be valid UTF-8 when doing that.
7208e52491SAdrien Destugues	if (fName == "UTF-8") {
7308e52491SAdrien Destugues		outputLength = std::min(inputLength, outputLength);
7408e52491SAdrien Destugues		inputLength = outputLength;
7508e52491SAdrien Destugues		memcpy(output, input, inputLength);
7608e52491SAdrien Destugues		return B_OK;
7708e52491SAdrien Destugues	}
7808e52491SAdrien Destugues
7908e52491SAdrien Destugues	UErrorCode error = U_ZERO_ERROR;
8008e52491SAdrien Destugues
8108e52491SAdrien Destugues	if (fUtf8Converter == NULL)
8208e52491SAdrien Destugues		fUtf8Converter = ucnv_open("UTF-8", &error);
8308e52491SAdrien Destugues
8408e52491SAdrien Destugues	if (fConverter == NULL)
8508e52491SAdrien Destugues		fConverter = ucnv_open(fName.String(), &error);
8608e52491SAdrien Destugues
8708e52491SAdrien Destugues	ucnv_convertEx(fUtf8Converter, fConverter, &target, output + outputLength,
8808e52491SAdrien Destugues		&base, input + inputLength, NULL, NULL, NULL, NULL, FALSE, TRUE,
8908e52491SAdrien Destugues		&error);
9008e52491SAdrien Destugues
9108e52491SAdrien Destugues	// inputLength is set to the number of bytes consumed. We may not use all of
9208e52491SAdrien Destugues	// the input data (for example if it is cut in the middle of an utf-8 char).
9308e52491SAdrien Destugues	inputLength = base - input;
9408e52491SAdrien Destugues	outputLength = target - output;
9508e52491SAdrien Destugues
9608e52491SAdrien Destugues	if (!U_SUCCESS(error))
9708e52491SAdrien Destugues		return B_ERROR;
9808e52491SAdrien Destugues
9908e52491SAdrien Destugues	return B_OK;
10008e52491SAdrien Destugues}
10108e52491SAdrien Destugues
10208e52491SAdrien Destugues
10308e52491SAdrien Destuguesstatus_t
104a71b10d7SAdrien DestuguesBTextEncoding::Encode(const char* input, size_t& inputLength, char* output,
10508e52491SAdrien Destugues	size_t& outputLength)
10608e52491SAdrien Destugues{
10708e52491SAdrien Destugues	const char* base = input;
10808e52491SAdrien Destugues	char* target = output;
10908e52491SAdrien Destugues
11008e52491SAdrien Destugues	// Optimize the easy case.
11108e52491SAdrien Destugues	// Note: we don't check the input to be valid UTF-8 when doing that.
11208e52491SAdrien Destugues	if (fName == "UTF-8") {
11308e52491SAdrien Destugues		outputLength = std::min(inputLength, outputLength);
11408e52491SAdrien Destugues		inputLength = outputLength;
11508e52491SAdrien Destugues		memcpy(output, input, inputLength);
11608e52491SAdrien Destugues		return B_OK;
11708e52491SAdrien Destugues	}
11808e52491SAdrien Destugues
11908e52491SAdrien Destugues	UErrorCode error = U_ZERO_ERROR;
12008e52491SAdrien Destugues
12108e52491SAdrien Destugues	if (fUtf8Converter == NULL)
12208e52491SAdrien Destugues		fUtf8Converter = ucnv_open("UTF-8", &error);
12308e52491SAdrien Destugues
12408e52491SAdrien Destugues	if (fConverter == NULL)
12508e52491SAdrien Destugues		fConverter = ucnv_open(fName.String(), &error);
12608e52491SAdrien Destugues
12708e52491SAdrien Destugues	ucnv_convertEx(fConverter, fUtf8Converter, &target, output + outputLength,
12808e52491SAdrien Destugues		&base, input + inputLength, NULL, NULL, NULL, NULL, FALSE, TRUE,
12908e52491SAdrien Destugues		&error);
13008e52491SAdrien Destugues
13108e52491SAdrien Destugues	// inputLength is set to the number of bytes consumed. We may not use all of
13208e52491SAdrien Destugues	// the input data (for example if it is cut in the middle of an utf-8 char).
13308e52491SAdrien Destugues	inputLength = base - input;
13408e52491SAdrien Destugues	outputLength = target - output;
13508e52491SAdrien Destugues
13608e52491SAdrien Destugues	if (!U_SUCCESS(error))
13708e52491SAdrien Destugues		return B_ERROR;
13808e52491SAdrien Destugues
13908e52491SAdrien Destugues	return B_OK;
14008e52491SAdrien Destugues}
14108e52491SAdrien Destugues
14208e52491SAdrien Destugues
14308e52491SAdrien Destuguesstatus_t
144a71b10d7SAdrien DestuguesBTextEncoding::Flush(char* output, size_t& outputLength)
14508e52491SAdrien Destugues{
14608e52491SAdrien Destugues	char* target = output;
14708e52491SAdrien Destugues
14808e52491SAdrien Destugues	if (fName == "UTF-8")
14908e52491SAdrien Destugues		return B_OK;
15008e52491SAdrien Destugues
15108e52491SAdrien Destugues	if (fUtf8Converter == NULL || fConverter == NULL)
15208e52491SAdrien Destugues		return B_NO_INIT;
15308e52491SAdrien Destugues
15408e52491SAdrien Destugues	UErrorCode error = U_ZERO_ERROR;
15508e52491SAdrien Destugues
15608e52491SAdrien Destugues	ucnv_convertEx(fConverter, fUtf8Converter, &target, output + outputLength,
15708e52491SAdrien Destugues		NULL, NULL, NULL, NULL, NULL, NULL, FALSE, TRUE,
15808e52491SAdrien Destugues		&error);
15908e52491SAdrien Destugues
16008e52491SAdrien Destugues	if (!U_SUCCESS(error))
16108e52491SAdrien Destugues		return B_ERROR;
16208e52491SAdrien Destugues
16308e52491SAdrien Destugues	return B_OK;
16408e52491SAdrien Destugues}
16508e52491SAdrien Destugues
16608e52491SAdrien Destugues
167fbb725bbSAdrien DestuguesBString
168a71b10d7SAdrien DestuguesBTextEncoding::GetName()
169fbb725bbSAdrien Destugues{
170fbb725bbSAdrien Destugues	return fName;
171fbb725bbSAdrien Destugues}
172a71b10d7SAdrien Destugues
173a71b10d7SAdrien Destugues
174a71b10d7SAdrien Destugues};
175