tek*_*ues 4 html delphi encoding
我正在使用Delphi 2009并想要解码HTML编码的字符串,例如:
&#39; -> '
Run Code Online (Sandbox Code Playgroud)
但是找不到任何内置功能.
提前致谢
HttpApp.HttpDecode 函数不解码 HTML 实体 ( https://www.w3.org/TR/html4/sgml/entities.html#sym )
例如:&there4; → ∴
function HtmlDecode(s: UnicodeString): UnicodeString;
{
Public domain: No attribution required
Known issue: it doesn't handle entities whose code points are above $FFFF (65535),
e.g.: &#x1F600;
That's because UTF-16 requires two 16-bit code units (a surrogate pair) to encode such a character.
}
function UCS4CharToString(uch: UCS4Char): UnicodeString;
{
  Converts a single UCS-4 code point into a UnicodeString by wrapping it
  in a two-element UCS4String and handing that to the RTL converter.
}
var
  codePoints: UCS4String;
begin
  SetLength(codePoints, 2);
  codePoints[1] := 0;   // trailing null terminator slot
  codePoints[0] := uch; // the code point to convert
  Result := UCS4StringToUnicodeString(codePoints);
end;
function GetCharRef(sValue: UnicodeString; StartIndex: Integer; out CharRef: string): UnicodeString;
{
  Tries to read a numeric character reference that begins at sValue[StartIndex].

  Character references come in either decimal or hex forms:
      &#9830;   //decimal
      &#x2666;  //hexadecimal
  As per the definition:
      CharRef ::= '&#' [0-9]+ ';'
                | '&#x' [0-9a-fA-F]+ ';'

  Parameters:
    sValue     - the text being scanned
    StartIndex - 1-based index of the '&' that is expected to start the reference
    CharRef    - on success receives the exact source text of the reference
                 (from '&' through ';'); on failure receives ''

  Returns the decoded character(s), or '' when the text at StartIndex is not a
  well-formed character reference. Malformed input (no digits, missing ';',
  value above U+10FFFF) never raises; it is reported by the empty result.
}
const
  MaxCodePoint = $10FFFF; // highest valid Unicode code point
var
  i: Integer;
  base: Integer;
  digit: Integer;
  digitCount: Integer;
  codePoint: Integer;
  isHex: Boolean;
begin
  Result := '';
  CharRef := '';

  if StartIndex < 1 then
    Exit;

  // The shortest possible reference is 4 characters long, e.g. '&#9;'
  if (Length(sValue) - StartIndex + 1) < 4 then
    Exit;

  i := StartIndex;
  if sValue[i] <> '&' then Exit;
  Inc(i);
  if sValue[i] <> '#' then Exit;
  Inc(i);

  // The hex marker is accepted in either case ('&#x41;' and '&#X41;')
  isHex := (sValue[i] = 'x') or (sValue[i] = 'X');
  if isHex then
  begin
    base := 16;
    Inc(i); //skip the x
  end
  else
    base := 10;

  // Accumulate the digits by hand so that an arbitrarily long digit run
  // can neither overflow nor raise a conversion exception.
  codePoint := 0;
  digitCount := 0;
  while i <= Length(sValue) do
  begin
    digit := -1;
    case sValue[i] of
      '0'..'9': digit := Ord(sValue[i]) - Ord('0');
      'a'..'f': if isHex then digit := Ord(sValue[i]) - Ord('a') + 10;
      'A'..'F': if isHex then digit := Ord(sValue[i]) - Ord('A') + 10;
    end;
    if digit < 0 then
      Break;

    // Once the value is out of range it stays out of range; stop multiplying
    // so the accumulator cannot wrap around.
    if codePoint <= MaxCodePoint then
      codePoint := codePoint * base + digit;
    Inc(digitCount);
    Inc(i);
  end;

  // Ran off the end of the string without finding the terminator
  if i > Length(sValue) then
    Exit;
  if sValue[i] <> ';' then
    Exit;
  // '&#;' and '&#x;' contain no digits and are not references
  if digitCount = 0 then
    Exit;
  if codePoint > MaxCodePoint then
    Exit;

  CharRef := Copy(sValue, StartIndex, (i - StartIndex) + 1);
  Result := UCS4CharToString(UCS4Char(codePoint));
end;
function GetEntityRef(sValue: string; StartIndex: Integer; out CharRef: string): UnicodeString;
function IsNameStartChar(ch: WideChar): Boolean;
begin
{
  Reports whether ch may begin a Name, following the production:

  NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]

  The final range [#x10000-#xEFFFF] cannot be tested here: ch is a single
  16-bit WideChar, so its ordinal value never exceeds $FFFF. Characters in
  that range arrive as two surrogate code units, neither of which is accepted
  below. Supporting them would require the caller to work on UCS-4 code points.
}
  case ch of
    ':', 'A'..'Z', '_', 'a'..'z', #$C0..#$D6, #$D8..#$F6, #$F8..#$FF: Result := True;
    #$100..#$2FF, #$370..#$37D, #$37F..#$FFF: Result := True;
    #$1000..#$1FFF, #$200C..#$200D, #$2070..#$218F, #$2C00..#$2FEF, #$3001..#$D7FF, #$F900..#$FDCF, #$FDF0..#$FFFD: Result := True;
  else
    Result := False;
  end;
end;
function IsNameChar(ch: WideChar): Boolean;
{
  Reports whether ch may appear inside a Name: any character accepted by
  IsNameStartChar, plus '-', '.', the digits, #$B7, and the ranges
  #$0300..#$036F and #$203F..#$2040.
}
begin
  Result := IsNameStartChar(ch);
  if not Result then
  begin
    // Not a start character; check the extra characters allowed after the first
    case ch of
      '-', '.', '0'..'9', #$B7, #$0300..#$036F, #$203F..#$2040: Result := True;
    end;
  end;
end;
type
THtmlEntity = record
entity: string;
ch: UCS4Char;
end;
const
//https://www.w3.org/TR/html4/sgml/entities.html#sym
//html entities are case sensitive (e.g. "larr" is different from "lArr")
HtmlEntities: array[0..252] of THtmlEntity = (
(entity: 'apos'; ch: 39; ), // apostrophe (originally only existed in xml, and not in HTML. Was added to HTML5
(entity: 'quot'; ch: 34; ), // quotation mark = APL quote, U+0022
(entity: 'amp'; ch: 38; ), // ampersand, U+0026
(entity: 'lt'; ch: 60; ), // less-than sign, U+003C
(entity: 'gt'; ch: 62; ), // greater-than sign, U+003E
(entity: 'OElig'; ch: 338; ), // latin capital ligature OE, U+0152
(entity: 'oelig'; ch: 339; ), // latin small ligature oe, U+0153
(entity: 'Scaron'; ch: 352; ), // latin capital letter S with caron, U+0160
(entity: 'scaron'; ch: 353; ), // latin small letter s with caron, U+0161
(entity: 'Yuml'; ch: 376; ), // latin capital letter Y with diaeresis, U+0178
(entity: 'circ'; ch: 710; ), // modifier letter circumflex accent, U+02C6
(entity: 'tilde'; ch: 732; ), // small tilde, U+02DC
(entity: 'nbsp'; ch: 160; ), // no-break space = non-breaking space, U+00A0
(entity: 'iexcl'; ch: 161; ), // inverted exclamation mark, U+00A1
(entity: 'cent'; ch: 162; ), // cent sign, U+00A2
(entity: 'pound'; ch: 163; ), // pound sign, U+00A3
(entity: 'curren'; ch: 164; ), // currency sign, U+00A4
(entity: 'yen'; ch: 165; ), // yen sign = yuan sign, U+00A5
(entity: 'brvbar'; ch: 166; ), // broken bar = broken vertical bar, U+00A6
(entity: 'sect'; ch: 167; ), // section sign, U+00A7
(entity: 'uml'; ch: 168; ), // diaeresis = spacing diaeresis, U+00A8
(entity: 'copy'; ch: 169; ), // copyright sign, U+00A9
(entity: 'ordf'; ch: 170; ), // feminine ordinal indicator, U+00AA
(entity: 'laquo'; ch: 171; ), // left-pointing double angle quotation mark = left pointing guillemet, U+00AB
(entity: 'not'; ch: 172; ), // not sign, U+00AC
(entity: 'shy'; ch: 173; ), // soft hyphen = discretionary hyphen, U+00AD
(entity: 'reg'; ch: 174; ), // registered sign = registered trade mark sign, U+00AE
(entity: 'macr'; ch: 175; ), // macron = spacing macron = overline = APL overbar, U+00AF
(entity: 'deg'; ch: 176; ), // degree sign, U+00B0
(entity: 'plusmn'; ch: 177; ), // plus-minus sign = plus-or-minus sign, U+00B1
(entity: 'sup2'; ch: 178; ), // superscript two = superscript digit two = squared, U+00B2
(entity: 'sup3'; ch: 179; ), // superscript three = superscript digit three = cubed, U+00B3
(entity: 'acute'; ch: 180; ), // acute accent = spacing acute, U+00B4
(entity: 'micro'; ch: 181; ), // micro sign, U+00B5
(entity: 'para'; ch: 182; ), // pilcrow sign = paragraph sign, U+00B6
(entity: 'middot'; ch: 183; ), // middle dot = Georgian comma = Greek middle dot, U+00B7
(entity: 'cedil'; ch: 184; ), // cedilla = spacing cedilla, U+00B8
(entity: 'sup1'; ch: 185; ), // superscript one = superscript digit one, U+00B9
(entity: 'ordm'; ch: 186; ), // masculine ordinal indicator, U+00BA
(entity: 'raquo'; ch: 187; ), // right-pointing double angle quotation mark = right pointing guillemet, U+00BB
(entity: 'frac14'; ch: 188; ), // vulgar fraction one quarter = fraction one quarter, U+00BC
(entity: 'frac12'; ch: 189; ), // vulgar fraction one half = fraction one half, U+00BD
(entity: 'frac34'; ch: 190; ), // vulgar fraction three quarters = fraction three quarters, U+00BE
(entity: 'iquest'; ch: 191; ), // inverted question mark = turned question mark, U+00BF
(entity: 'Agrave'; ch: 192; ), // latin capital letter A with grave = latin capital letter A grave, U+00C0
(entity: 'Aacute'; ch: 193; ), // latin capital letter A with acute, U+00C1
(entity: 'Acirc'; ch: 194; ), // latin capital letter A with circumflex, U+00C2
(entity: 'Atilde'; ch: 195; ), // latin capital letter A with tilde, U+00C3
(entity: 'Auml'; ch: 196; ), // latin capital letter A with diaeresis, U+00C4
(entity: 'Aring'; ch: 197; ), // latin capital letter A with ring above = latin capital letter A ring, U+00C5
(entity: 'AElig'; ch: 198; ), // latin capital letter AE = latin capital ligature AE, U+00C6
(entity: 'Ccedil'; ch: 199; ), // latin capital letter C with cedilla, U+00C7
(entity: 'Egrave'; ch: 200; ), // latin capital letter E with grave, U+00C8
(entity: 'Eacute'; ch: 201; ), // latin capital letter E with acute, U+00C9
(entity: 'Ecirc'; ch: 202; ), // latin capital letter E with circumflex, U+00CA
(entity: 'Euml'; ch: 203; ), // latin capital letter E with diaeresis, U+00CB
(entity: 'Igrave'; ch: 204; ), // latin capital letter I with grave, U+00CC
(entity: 'Iacute'; ch: 205; ), // latin capital letter I with acute, U+00CD
(entity: 'Icirc'; ch: 206; ), // latin capital letter I with circumflex, U+00CE
(entity: 'Iuml'; ch: 207; ), // latin capital letter I with diaeresis, U+00CF
(entity: 'ETH'; ch: 208; ), // latin capital letter ETH, U+00D0
(entity: 'Ntilde'; ch: 209; ), // latin capital letter N with tilde, U+00D1
(entity: 'Ograve'; ch: 210; ), // latin capital letter O with grave, U+00D2
(entity: 'Oacute'; ch: 211; ), // latin capital letter O with acute, U+00D3
(entity: 'Ocirc'; ch: 212; ), // latin capital letter O with circumflex, U+00D4
(entity: 'Otilde'; ch: 213; ), // latin capital letter O with tilde, U+00D5
(entity: 'Ouml'; ch: 214; ), // latin capital letter O with diaeresis, U+00D6
(entity: 'times'; ch: 215; ), // multiplication sign, U+00D7
(entity: 'Oslash'; ch: 216; ), // latin capital letter O with stroke = latin capital letter O slash, U+00D8
(entity: 'Ugrave'; ch: 217; ), // latin capital letter U with grave, U+00D9
(entity: 'Uacute'; ch: 218; ), // latin capital letter U with acute, U+00DA
(entity: 'Ucirc'; ch: 219; ), // latin capital letter U with circumflex, U+00DB
(entity: 'Uuml'; ch: 220; ), // latin capital letter U with diaeresis, U+00DC
(entity: 'Yacute'; ch: 221; ), // latin capital letter Y with acute, U+00DD
(entity: 'THORN'; ch: 222; ), // latin capital letter THORN, U+00DE
(entity: 'szlig'; ch: 223; ), // latin small letter sharp s = ess-zed, U+00DF
(entity: 'agrave'; ch: 224; ), // latin small letter a with grave = latin small letter a grave, U+00E0
(entity: 'aacute'; ch: 225; ), // latin small letter a with acute, U+00E1
(entity: 'acirc'; ch: 226; ), // latin small letter a with circumflex, U+00E2
(entity: 'atilde'; ch: 227; ), // latin small letter a with tilde, U+00E3
(entity: 'auml'; ch: 228; ), // latin small letter a with diaeresis, U+00E4
(entity: 'aring'; ch: 229; ), // latin small letter a with ring above = latin small letter a ring, U+00E5
(entity: 'aelig'; ch: 230; ), // latin small letter ae = latin small ligature ae, U+00E6
(entity: 'ccedil'; ch: 231; ), // latin small letter c with cedilla, U+00E7
(entity: 'egrave'; ch: 232; ), // latin small letter e with grave, U+00E8
(entity: 'eacute'; ch: 233; ), // latin small letter e with acute, U+00E9
(entity: 'ecirc'; ch: 234; ), // latin small letter e with circumflex, U+00EA
(entity: 'euml'; ch: 235; ), // latin small letter e with diaeresis, U+00EB
(entity: 'igrave'; ch: 236; ), // latin small letter i with grave, U+00EC
(entity: 'iacute'; ch: 237; ), // latin small letter i with acute, U+00ED
(entity: 'icirc'; ch: 238; ), // latin small letter i with circumflex, U+00EE
(entity: 'iuml'; ch: 239; ), // latin small letter i with diaeresis, U+00EF
(entity: 'eth'; ch: 240; ), // latin small letter eth, U+00F0
(entity: 'ntilde'; ch: 241; ), // latin small letter n with tilde, U+00F1
(entity: 'ograve'; ch: 242; ), // latin small letter o with grave, U+00F2
(entity: 'oacute'; ch: 243; ), // latin small letter o with acute, U+00F3
(entity: 'ocirc'; ch: 244; ), // latin small letter o with circumflex, U+00F4
(entity: 'otilde'; ch: 245; ), // latin small letter o with tilde, U+00F5
(entity: 'ouml'; ch: 246; ), // latin small letter o with diaeresis, U+00F6
(entity: 'divide'; ch: 247; ), // division sign, U+00F7
(entity: 'oslash'; ch: 248; ), // latin small letter o with stroke, = latin small letter o slash, U+00F8
(entity: 'ugrave'; ch: 249; ), // latin small letter u with grave, U+00F9
(entity: 'uacute'; ch: 250; ), // latin small letter u with acute, U+00FA
(entity: 'ucirc'; ch: 251; ), // latin small letter u with circumflex, U+00FB
(entity: 'uuml'; ch: 252; ), // latin small letter u with diaeresis, U+00FC
(entity: 'yacute'; ch: 253; ), // latin small letter y with acute, U+00FD
(entity: 'thorn'; ch: 254; ), // latin small letter thorn, U+00FE
(entity: 'yuml'; ch: 255; ), // latin small letter y with diaeresis, U+00FF
(entity: 'fnof'; ch: 402; ), // latin small f with hook = function = florin, U+0192
(entity: 'Alpha'; ch: 913; ), // greek capital letter alpha, U+0391
(entity: 'Beta'; ch: 914; ), // greek capital letter beta, U+0392
(entity: 'Gamma'; ch: 915; ), // greek capital letter gamma, U+0393
(entity: 'Delta'; ch: 916; ), // greek capital letter delta, U+0394
(entity: 'Epsilon'; ch: 917; ), // greek capital letter epsilon, U+0395
(entity: 'Zeta'; ch: 918; ), // greek capital letter zeta, U+0396
(entity: 'Eta'; ch: 919; ), // greek capital letter eta, U+0397
(entity: 'Theta'; ch: 920; ), // greek capital letter theta, U+0398
(entity: 'Iota'; ch: 921; ), // greek capital letter iota, U+0399
(entity: 'Kappa'; ch: 922; ), // greek capital letter kappa, U+039A
(entity: 'Lambda'; ch: 923; ), // greek capital letter lambda, U+039B
(entity: 'Mu'; ch: 924; ), // greek capital letter mu, U+039C
(entity: 'Nu'; ch: 925; ), // greek capital letter nu, U+039D
(entity: 'Xi'; ch: 926; ), // greek capital letter xi, U+039E
(entity: 'Omicron'; ch: 927; ), // greek capital letter omicron, U+039F
(entity: 'Pi'; ch: 928; ), // greek capital letter pi, U+03A0
(entity: 'Rho'; ch: 929; ), // greek capital letter rho, U+03A1
// there is no Sigmaf, and no U+03A2 character either
(entity: 'Sigma'; ch: 931; ), // greek capital letter sigma, U+03A3
(entity: 'Tau'; ch: 932; ), // greek capital letter tau, U+03A4
(entity: 'Upsilon'; ch: 933; ), // greek capital letter upsilon, U+03A5
(entity: 'Phi'; ch: 934; ), // greek capital letter phi, U+03A6
(entity: 'Chi'; ch: 935; ), // greek capital letter chi, U+03A7
(entity: 'Psi'; ch: 936; ), // greek capital letter psi, U+03A8
(entity: 'Omega'; ch: 937; ), // greek capital letter omega, U+03A9
(entity: 'alpha'; ch: 945; ), // greek small letter alpha, U+03B1
(entity: 'beta'; ch: 946; ), // greek small letter beta, U+03B2
(entity: 'gamma'; ch: 947; ), // greek small letter gamma, U+03B3
(entity: 'delta'; ch: 948; ), // greek small letter delta, U+03B4
(entity: 'epsilon'; ch: 949; ), // greek small letter epsilon, U+03B5
(entity: 'zeta'; ch: 950; ), // greek small letter zeta, U+03B6
(entity: 'eta'; ch: 951; ), // greek small letter eta, U+03B7
(entity: 'theta'; ch: 952; ), // greek small letter theta, U+03B8
(entity: 'iota'; ch: 953; ), // greek small letter iota, U+03B9
(entity: 'kappa'; ch: 954; ), // greek small letter kappa, U+03BA
(entity: 'lambda'; ch: 955; ), // greek small letter lambda, U+03BB
(entity: 'mu'; ch: 956; ), // greek small letter mu, U+03BC
(entity: 'nu'; ch: 957; ), // greek small letter nu, U+03BD
(entity: 'xi'; ch: 958; ), // greek small letter xi, U+03BE
(entity: 'omicron'; ch: 959; ), // greek small letter omicron, U+03BF NEW
(entity: 'pi'; ch: 960; ), // greek small letter pi, U+03C0
(entity: 'rho'; ch: 961; ), // greek small letter rho, U+03C1
小智 5
这是我的HTMLDecode程序(从CGs HTTPApp单元略微修改):
function HTMLDecode(const AStr: String): String;
{
  Decodes a small set of HTML references in AStr and returns the result.

  Recognised references (names are case sensitive):
    &amp;  -> &        &lt;   -> <        &gt;   -> >
    &quot; -> "        &apos; -> '        &nbsp; -> ' ' (a plain space)
    &#NNN; and &#xHHH; -> the character with that code point

  Anything else that starts with '&' (an unknown entity name, a reference
  without the closing ';', a numeric reference with no digits or a value above
  U+10FFFF) is copied to the output unchanged, so the function never returns
  a partially filled buffer and never reads past the end of the input.

  Code points above $FFFF are written as a UTF-16 surrogate pair.
  NOTE(review): this assumes String is UnicodeString (Delphi 2009 or later);
  with an ANSI String, characters above $FF cannot be represented - confirm
  the target compiler.
}

  // Parses the text after '#' in a numeric reference ('65', 'x41', 'X41').
  // Returns False for empty, malformed or out-of-range (> $10FFFF) input.
  function TryParseCodePoint(const Digits: String; out Code: Integer): Boolean;
  var
    Idx, Base, Digit: Integer;
  begin
    Result := False;
    Code := 0;
    Base := 10;
    Idx := 1;
    if (Digits <> '') and ((Digits[1] = 'x') or (Digits[1] = 'X')) then
    begin
      Base := 16;
      Idx := 2;
    end;
    if Idx > Length(Digits) then
      Exit; // no digits at all
    while Idx <= Length(Digits) do
    begin
      Digit := -1;
      case Digits[Idx] of
        '0'..'9': Digit := Ord(Digits[Idx]) - Ord('0');
        'a'..'f': if Base = 16 then Digit := Ord(Digits[Idx]) - Ord('a') + 10;
        'A'..'F': if Base = 16 then Digit := Ord(Digits[Idx]) - Ord('A') + 10;
      end;
      if Digit < 0 then
        Exit;
      Code := Code * Base + Digit;
      // Checked after every digit, so the accumulator can never overflow
      if Code > $10FFFF then
        Exit;
      Inc(Idx);
    end;
    Result := True;
  end;

var
  SrcPos, DstLen, EndPos, K: Integer;
  Name, Decoded: String;
  Code: Integer;
begin
  // Every reference is at least as long as its decoded form, so the input
  // length is a safe upper bound for the output buffer.
  SetLength(Result, Length(AStr));
  DstLen := 0;
  SrcPos := 1;

  while SrcPos <= Length(AStr) do
  begin
    Decoded := '';
    EndPos := 0; // index of the terminating ';' when a reference was decoded

    if AStr[SrcPos] = '&' then
    begin
      // Scan the candidate reference body; stop at the first character that
      // cannot be part of one, so each input character is visited at most twice.
      K := SrcPos + 1;
      while K <= Length(AStr) do
      begin
        case AStr[K] of
          '#', '0'..'9', 'A'..'Z', 'a'..'z': Inc(K);
        else
          Break;
        end;
      end;

      if (K <= Length(AStr)) and (AStr[K] = ';') and (K > SrcPos + 1) then
      begin
        Name := Copy(AStr, SrcPos + 1, K - SrcPos - 1);
        if Name = 'amp' then { do not localize }
          Decoded := '&'
        else if Name = 'lt' then { do not localize }
          Decoded := '<'
        else if Name = 'gt' then { do not localize }
          Decoded := '>'
        else if Name = 'nbsp' then { do not localize }
          Decoded := ' '
        else if Name = 'quot' then { do not localize }
          Decoded := '"'
        else if Name = 'apos' then { do not localize }
          Decoded := ''''
        else if Name[1] = '#' then
        begin
          if TryParseCodePoint(Copy(Name, 2, MaxInt), Code) then
          begin
            if Code <= $FFFF then
              Decoded := Char(Code)
            else
            begin
              // Outside the BMP: emit a high/low surrogate pair
              Dec(Code, $10000);
              Decoded := Char($D800 + (Code shr 10));
              Decoded := Decoded + Char($DC00 + (Code and $3FF));
            end;
          end;
        end;

        if Decoded <> '' then
          EndPos := K;
      end;
    end;

    if EndPos > 0 then
    begin
      for K := 1 to Length(Decoded) do
      begin
        Inc(DstLen);
        Result[DstLen] := Decoded[K];
      end;
      SrcPos := EndPos + 1;
    end
    else
    begin
      // Ordinary character, or an '&' that does not start a known reference
      Inc(DstLen);
      Result[DstLen] := AStr[SrcPos];
      Inc(SrcPos);
    end;
  end;

  SetLength(Result, DstLen);
end;
Run Code Online (Sandbox Code Playgroud)