You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

236 lines
7.8 KiB

// Windows-1252 characters for the bytes in 0x80..0x9F that the table defines.
// Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no entry here.
const WINDOWS_1252_EXTRA = {
  // 0x80..0x8F
  0x80: "€", 0x82: "‚", 0x83: "ƒ", 0x84: "„", 0x85: "…", 0x86: "†", 0x87: "‡",
  0x88: "ˆ", 0x89: "‰", 0x8a: "Š", 0x8b: "‹", 0x8c: "Œ", 0x8e: "Ž",
  // 0x90..0x9F
  0x91: "‘", 0x92: "’", 0x93: "“", 0x94: "”", 0x95: "•", 0x96: "–", 0x97: "—",
  0x98: "˜", 0x99: "™", 0x9a: "š", 0x9b: "›", 0x9c: "œ", 0x9e: "ž", 0x9f: "Ÿ",
};
// Inverse lookup (character -> byte value), derived from the table above.
const WINDOWS_1252_REVERSE = {};
Object.keys(WINDOWS_1252_EXTRA).forEach((byteKey) => {
  // Object keys are decimal strings ("128", ...), so Number() recovers the byte.
  WINDOWS_1252_REVERSE[WINDOWS_1252_EXTRA[byteKey]] = Number(byteKey);
});
// ---------- Lazily created, shared UTF-8 codec instances ----------
let _utf8Decoder;
let _utf8Encoder;
/**
 * Return the shared native UTF-8 decoder, creating it on first use.
 * @returns The cached `TextDecoder`, or `undefined` when `globalThis.TextDecoder` is not defined
 */
function utf8Decoder() {
  if (typeof globalThis.TextDecoder === "undefined") {
    return undefined;
  }
  if (_utf8Decoder === null || _utf8Decoder === undefined) {
    _utf8Decoder = new globalThis.TextDecoder("utf-8");
  }
  return _utf8Decoder;
}
/**
 * Return the shared native UTF-8 encoder, creating it on first use.
 * @returns The cached `TextEncoder`, or `undefined` when `globalThis.TextEncoder` is not defined
 */
function utf8Encoder() {
  if (typeof globalThis.TextEncoder === "undefined") {
    return undefined;
  }
  if (_utf8Encoder === null || _utf8Encoder === undefined) {
    _utf8Encoder = new globalThis.TextEncoder();
  }
  return _utf8Encoder;
}
// Maximum number of code units the decoders collect before flushing a chunk.
// 32 Ki sits far below a limit of roughly 105k that was measured for this code
// (NOTE(review): presumably the largest argument list that
// String.fromCharCode.apply tolerated - confirm), and keeps temporaries small.
const CHUNK = 0x8000; // 32 * 1024
/**
 * Decode binary data into a string.
 *
 * The encoding name is matched case-insensitively. Supported names are
 * "utf-8"/"utf8", "utf-16le", "us-ascii"/"ascii", "latin1"/"iso-8859-1"
 * and "windows-1252". UTF-8 uses the native decoder when one is available
 * and the built-in fallback otherwise.
 *
 * @param bytes Binary data
 * @param encoding Encoding name, defaults to "utf-8"
 * @returns The decoded text
 * @throws {RangeError} When the encoding is not one of the supported names
 */
export function textDecode(bytes, encoding = "utf-8") {
  const name = encoding.toLowerCase();
  if (name === "utf-8" || name === "utf8") {
    const native = utf8Decoder();
    if (native) {
      return native.decode(bytes);
    }
    return decodeUTF8(bytes);
  }
  if (name === "utf-16le") {
    return decodeUTF16LE(bytes);
  }
  if (name === "us-ascii" || name === "ascii") {
    return decodeASCII(bytes);
  }
  if (name === "latin1" || name === "iso-8859-1") {
    return decodeLatin1(bytes);
  }
  if (name === "windows-1252") {
    return decodeWindows1252(bytes);
  }
  throw new RangeError(`Encoding '${encoding}' not supported`);
}
/**
 * Encode a string into binary data.
 *
 * The encoding name is matched case-insensitively. Supported names are
 * "utf-8"/"utf8", "utf-16le", "us-ascii"/"ascii", "latin1"/"iso-8859-1"
 * and "windows-1252". UTF-8 uses the native encoder when one is available
 * and the built-in fallback otherwise.
 *
 * @param input Text to encode, defaults to the empty string
 * @param encoding Encoding name, defaults to "utf-8"
 * @returns The encoded bytes
 * @throws {RangeError} When the encoding is not one of the supported names
 */
export function textEncode(input = "", encoding = "utf-8") {
  const name = encoding.toLowerCase();
  if (name === "utf-8" || name === "utf8") {
    const native = utf8Encoder();
    if (native) {
      return native.encode(input);
    }
    return encodeUTF8(input);
  }
  if (name === "utf-16le") {
    return encodeUTF16LE(input);
  }
  if (name === "us-ascii" || name === "ascii") {
    return encodeASCII(input);
  }
  if (name === "latin1" || name === "iso-8859-1") {
    return encodeLatin1(input);
  }
  if (name === "windows-1252") {
    return encodeWindows1252(input);
  }
  throw new RangeError(`Encoding '${encoding}' not supported`);
}
// --- Internal helpers ---
/**
 * Fallback UTF-8 decoder used when no native `TextDecoder` is available.
 *
 * Malformed input never throws. Each invalid byte or truncated sequence
 * yields U+FFFD, and overlong forms, encoded surrogates and code points
 * above U+10FFFF are rejected the same way. A leading byte-order mark
 * (EF BB BF) is skipped. Both rules mirror a default, non-fatal
 * `TextDecoder`, so the fallback and the native path produce the same text.
 *
 * @param bytes Indexable sequence of byte values with a `length`
 * @returns The decoded text
 */
function decodeUTF8(bytes) {
  const REPLACEMENT = "\ufffd";
  const parts = [];
  let out = "";
  let i = 0;
  // Skip a leading BOM, as the native decoder does by default.
  if (bytes.length >= 3 && bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) {
    i = 3;
  }
  let cp = 0; // code point being assembled
  let needed = 0; // continuation bytes still expected
  // Allowed range for the NEXT continuation byte. It is narrowed after certain
  // lead bytes to reject overlong forms, surrogates and values > U+10FFFF.
  let lower = 0x80;
  let upper = 0xbf;
  while (i < bytes.length) {
    const b = bytes[i];
    if (needed === 0) {
      i++;
      if (b < 0x80) {
        out += String.fromCharCode(b);
      }
      else if (b >= 0xc2 && b <= 0xdf) {
        needed = 1;
        cp = b & 0x1f;
      }
      else if (b >= 0xe0 && b <= 0xef) {
        if (b === 0xe0) {
          lower = 0xa0; // below this would be an overlong 3-byte form
        }
        else if (b === 0xed) {
          upper = 0x9f; // above this would encode a surrogate
        }
        needed = 2;
        cp = b & 0x0f;
      }
      else if (b >= 0xf0 && b <= 0xf4) {
        if (b === 0xf0) {
          lower = 0x90; // below this would be an overlong 4-byte form
        }
        else if (b === 0xf4) {
          upper = 0x8f; // above this would exceed U+10FFFF
        }
        needed = 3;
        cp = b & 0x07;
      }
      else {
        // Stray continuation byte, 0xC0/0xC1, or 0xF5..0xFF.
        out += REPLACEMENT;
      }
    }
    else if (b < lower || b > upper) {
      // Broken sequence: report it and re-examine this byte as a new start
      // (i is deliberately not advanced).
      cp = 0;
      needed = 0;
      lower = 0x80;
      upper = 0xbf;
      out += REPLACEMENT;
    }
    else {
      i++;
      lower = 0x80;
      upper = 0xbf;
      cp = (cp << 6) | (b & 0x3f);
      needed--;
      if (needed === 0) {
        if (cp < 0x10000) {
          out += String.fromCharCode(cp);
        }
        else {
          const offset = cp - 0x10000;
          out += String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
        }
        cp = 0;
      }
    }
    // Flush periodically so the working string stays bounded.
    if (out.length >= CHUNK) {
      parts.push(out);
      out = "";
    }
  }
  // Input ended in the middle of a multi-byte sequence.
  if (needed !== 0) {
    out += REPLACEMENT;
  }
  if (out) {
    parts.push(out);
  }
  return parts.join("");
}
/**
 * Decode little-endian UTF-16 bytes into a string.
 *
 * Each byte pair becomes one UTF-16 code unit (low byte first). A trailing
 * odd byte is ignored. Code units are converted in batches of at most
 * CHUNK units per `String.fromCharCode` call.
 *
 * @param bytes Indexable sequence of byte values with a `length`
 * @returns The decoded text ("" when fewer than two bytes are given)
 */
function decodeUTF16LE(bytes) {
  // Round the length down to an even number of bytes.
  const usable = bytes.length & ~1;
  const pieces = [];
  let offset = 0;
  while (offset < usable) {
    const count = Math.min(CHUNK, (usable - offset) >> 1);
    const codeUnits = [];
    for (let n = 0; n < count; n++) {
      codeUnits.push(bytes[offset] | (bytes[offset + 1] << 8));
      offset += 2;
    }
    pieces.push(String.fromCharCode.apply(null, codeUnits));
  }
  return pieces.join("");
}
/**
 * Decode bytes as 7-bit ASCII.
 *
 * The high bit of every byte is cleared before conversion, so bytes
 * >= 0x80 map onto the character of their low seven bits. Conversion
 * runs in batches of at most CHUNK bytes.
 *
 * @param bytes Indexable sequence of byte values with a `length`
 * @returns The decoded text
 */
function decodeASCII(bytes) {
  const pieces = [];
  let start = 0;
  while (start < bytes.length) {
    const stop = Math.min(start + CHUNK, bytes.length);
    const batch = [];
    for (let pos = start; pos < stop; pos++) {
      batch.push(bytes[pos] & 0x7f);
    }
    pieces.push(String.fromCharCode.apply(null, batch));
    start += CHUNK;
  }
  return pieces.join("");
}
/**
 * Decode bytes as Latin-1 (ISO-8859-1), where every byte value is used
 * directly as the character code.
 *
 * Conversion runs in batches of at most CHUNK bytes, passed to
 * `String.fromCharCode` via `apply` rather than spread.
 *
 * @param bytes Indexable sequence of byte values with a `length`
 * @returns The decoded text
 */
function decodeLatin1(bytes) {
  const total = bytes.length;
  const pieces = [];
  for (let from = 0; from < total; from += CHUNK) {
    const size = Math.min(total, from + CHUNK) - from;
    const batch = new Array(size);
    for (let n = 0; n < size; n++) {
      batch[n] = bytes[from + n];
    }
    pieces.push(String.fromCharCode.apply(null, batch));
  }
  return pieces.join("");
}
/**
 * Decode bytes as Windows-1252.
 *
 * Bytes in 0x80..0x9F are looked up in WINDOWS_1252_EXTRA. Bytes in that
 * range without a table entry, and all other bytes, are used directly as
 * the character code.
 *
 * @param bytes Indexable sequence of byte values with a `length`
 * @returns The decoded text
 */
function decodeWindows1252(bytes) {
  const pieces = [];
  let buffer = "";
  let index = 0;
  while (index < bytes.length) {
    const byte = bytes[index++];
    let ch;
    if (byte >= 0x80 && byte <= 0x9f) {
      ch = WINDOWS_1252_EXTRA[byte];
    }
    if (ch === null || ch === undefined) {
      ch = String.fromCharCode(byte);
    }
    buffer += ch;
    // Flush periodically so the working string stays bounded.
    if (buffer.length >= CHUNK) {
      pieces.push(buffer);
      buffer = "";
    }
  }
  if (buffer) {
    pieces.push(buffer);
  }
  return pieces.join("");
}
/**
 * Fallback UTF-8 encoder used when no native `TextEncoder` is available.
 *
 * Valid surrogate pairs are combined into a single 4-byte sequence. An
 * unpaired surrogate cannot be represented in well-formed UTF-8, so it is
 * encoded as U+FFFD (EF BF BD), matching what `TextEncoder` produces.
 *
 * @param str Text to encode
 * @returns {Uint8Array} The UTF-8 bytes
 */
function encodeUTF8(str) {
  const out = [];
  for (let i = 0; i < str.length; i++) {
    let cp = str.charCodeAt(i);
    if (cp >= 0xd800 && cp <= 0xdfff) {
      // Only a high surrogate immediately followed by a low one is valid.
      const lo = cp <= 0xdbff && i + 1 < str.length ? str.charCodeAt(i + 1) : 0;
      if (lo >= 0xdc00 && lo <= 0xdfff) {
        cp = 0x10000 + ((cp - 0xd800) << 10) + (lo - 0xdc00);
        i++; // the low surrogate has been consumed
      }
      else {
        cp = 0xfffd; // lone surrogate -> replacement character
      }
    }
    if (cp < 0x80) {
      out.push(cp);
    }
    else if (cp < 0x800) {
      out.push(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
    }
    else if (cp < 0x10000) {
      out.push(0xe0 | (cp >> 12), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
    }
    else {
      out.push(0xf0 | (cp >> 18), 0x80 | ((cp >> 12) & 0x3f), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
    }
  }
  return new Uint8Array(out);
}
/**
 * Encode a string as little-endian UTF-16: every UTF-16 code unit of the
 * string becomes two bytes, low byte first. No byte-order mark is added.
 *
 * @param str Text to encode
 * @returns {Uint8Array} Twice as many bytes as the string has code units
 */
function encodeUTF16LE(str) {
  const unitCount = str.length;
  const bytes = new Uint8Array(unitCount * 2);
  let pos = 0;
  for (let u = 0; u < unitCount; u++) {
    const unit = str.charCodeAt(u);
    bytes[pos++] = unit & 0xff; // low byte
    bytes[pos++] = unit >>> 8; // high byte
  }
  return bytes;
}
/**
 * Encode a string as 7-bit ASCII, one byte per UTF-16 code unit.
 *
 * Only the low seven bits of each code unit are kept, so characters
 * outside ASCII are folded into the ASCII range instead of being rejected.
 *
 * @param str Text to encode
 * @returns {Uint8Array} One byte per code unit of the string
 */
function encodeASCII(str) {
  const bytes = new Uint8Array(str.length);
  let pos = 0;
  while (pos < str.length) {
    bytes[pos] = str.charCodeAt(pos) & 0x7f;
    pos++;
  }
  return bytes;
}
/**
 * Encode a string as Latin-1 (ISO-8859-1), one byte per UTF-16 code unit.
 *
 * Only the low eight bits of each code unit are kept, so characters above
 * U+00FF are truncated instead of being rejected.
 *
 * @param str Text to encode
 * @returns {Uint8Array} One byte per code unit of the string
 */
function encodeLatin1(str) {
  const size = str.length;
  const bytes = new Uint8Array(size);
  for (let pos = 0; pos !== size; pos += 1) {
    const unit = str.charCodeAt(pos);
    bytes[pos] = unit & 0xff;
  }
  return bytes;
}
/**
 * Encode a string as Windows-1252, one byte per UTF-16 code unit.
 *
 * Code units up to 0xFF are stored unchanged. Anything above is looked up
 * in WINDOWS_1252_REVERSE, and units without an entry become '?' (0x3F).
 *
 * @param str Text to encode
 * @returns {Uint8Array} One byte per code unit of the string
 */
function encodeWindows1252(str) {
  const bytes = new Uint8Array(str.length);
  for (let pos = 0; pos < str.length; pos++) {
    const unit = str.charCodeAt(pos);
    if (unit > 0xff) {
      const special = WINDOWS_1252_REVERSE[str.charAt(pos)];
      bytes[pos] = special === undefined ? 0x3f : special; // 0x3f is '?'
    }
    else {
      bytes[pos] = unit;
    }
  }
  return bytes;
}