delphi html解码

tek*_*ues 4 html delphi encoding

我正在使用Delphi 2009并想要解码HTML编码的字符串,例如:

&#39; -> '
Run Code Online (Sandbox Code Playgroud)

但是找不到任何内置功能.

提前致谢

小智 17

查看HTTPApp单元.HTTPDecode和HTMLDecode(以及编码功能).您应该在Source/Win32/Internet文件夹中找到它.


Ian*_*oyd 6

HttpApp.HttpDecode 函数不会解码 HTML 实体(参见 https://www.w3.org/TR/html4/sgml/entities.html#sym )

例如:&there4;(应解码为 ∴)

function HtmlDecode(s: UnicodeString): UnicodeString;
{ 
   Public domain: No attribution required
   Known issue, it doesn't handle entities with characters code points above $FFFF (65536)
   e.g.: &;

   That's because UTF-16 requires 2 characters to encode one character.
 }

    function UCS4CharToString(uch: UCS4Char): UnicodeString;
    var
        codePoints: UCS4String;
    begin
        //Build a two-element UCS4 buffer: the requested code point followed by
        //the zero terminator, then hand it to the RTL for conversion.
        SetLength(codePoints, 2);
        codePoints[1] := 0;
        codePoints[0] := uch;
        Result := UCS4StringToUnicodeString(codePoints);
    end;

    function GetCharRef(sValue: UnicodeString; StartIndex: Integer; out CharRef: string): UnicodeString;
    //Decodes the numeric character reference that starts at sValue[StartIndex].
    //
    //Character references come in either decimal or hex forms:
    //
    //    &#9830;     decimal
    //    &#x2666;    hexadecimal (HTML also allows an upper-case X: &#X2666;)
    //
    //As per the definition:
    //
    //    CharRef  ::=  '&#' [0-9]+ ';'
    //                  |
    //                  '&#x' [0-9a-fA-F]+ ';'
    //
    //Parameters:
    //    sValue      text being scanned
    //    StartIndex  1-based index of the '&' that should open the reference
    //    CharRef     receives the exact source text of the reference (e.g. '&#9830;'),
    //                or '' when no valid reference starts at StartIndex
    //
    //Returns the decoded character(s), or '' when no valid reference starts at
    //StartIndex. Malformed input (no digits, missing ';', value that does not fit,
    //value outside 0..$10FFFF) is reported by returning '' rather than by raising.
    var
        i: Integer;
        digitsStart: Integer;
        isHex: Boolean;
        radixPrefix: string;
        code: Integer;
    begin
        Result := '';
        CharRef := '';

        if StartIndex < 1 then
            Exit;

        //The shortest possible reference is four characters long, e.g. "&#9;"
        if (Length(sValue) - StartIndex + 1) < 4 then
            Exit;

        i := StartIndex;
        if sValue[i] <> '&' then Exit;
        Inc(i);
        if sValue[i] <> '#' then Exit;
        Inc(i);

        //An 'x' (or 'X') after "&#" selects the hexadecimal form
        isHex := (sValue[i] = 'x') or (sValue[i] = 'X');
        if isHex then
        begin
            Inc(i); //skip the x
            radixPrefix := '$';
        end
        else
            radixPrefix := '';

        //Consume the run of digits; the bounds test comes first so we never index past the end
        digitsStart := i;
        while i <= Length(sValue) do
        begin
            if isHex then
            begin
                if not CharInSet(sValue[i], ['0'..'9', 'a'..'f', 'A'..'F']) then
                    Break;
            end
            else
            begin
                if not CharInSet(sValue[i], ['0'..'9']) then
                    Break;
            end;
            Inc(i);
        end;

        if i > Length(sValue) then
            Exit; //ran off the end without finding the terminator
        if sValue[i] <> ';' then
            Exit;
        if i = digitsStart then
            Exit; //"&#;" or "&#x;" - there is no number to convert

        //TryStrToInt fails (instead of raising) when the digits overflow an Integer
        if not TryStrToInt(radixPrefix + Copy(sValue, digitsStart, i - digitsStart), code) then
            Exit;

        //Eight hex digits can come back negative; anything above $10FFFF is not a Unicode code point
        if (code < 0) or (code > $10FFFF) then
            Exit;

        CharRef := Copy(sValue, StartIndex, (i - StartIndex) + 1);
        Result := UCS4CharToString(UCS4Char(code));
    end;

    function GetEntityRef(sValue: string; StartIndex: Integer; out CharRef: string): UnicodeString;

        function IsNameStartChar(ch: WideChar): Boolean;
        begin
            //Reports whether ch may be the first character of a name:
            //
            //    NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
            //                    | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F]
            //                    | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
            //                    | [#x10000-#xEFFFF]
            //
            //Every literal is written with four hex digits on purpose. With HIGHCHARUNICODE OFF
            //(the default) a two-digit literal such as #$C0 is parsed as an AnsiChar and converted
            //through the system code page, which makes the ranges depend on the machine's locale.
            //Four-digit literals are always parsed as WideChar.
            //
            //The [#x10000-#xEFFFF] range cannot be tested here: a WideChar is a single UTF-16 code
            //unit, so such characters arrive as surrogate pairs and each surrogate yields False.
            //Supporting them would require the caller to work on UCS4 code points instead.
            case ch of
            ':', '_', 'A'..'Z', 'a'..'z',
            #$00C0..#$00D6, #$00D8..#$00F6, #$00F8..#$02FF,
            #$0370..#$037D, #$037F..#$1FFF,
            #$200C..#$200D, #$2070..#$218F, #$2C00..#$2FEF,
            #$3001..#$D7FF, #$F900..#$FDCF, #$FDF0..#$FFFD:
                Result := True;
            else
                Result := False;
            end;
        end;

        function IsNameChar(ch: WideChar): Boolean;
        begin
            //Reports whether ch may appear after the first character of a name:
            //every name-start character, plus "-", ".", digits, U+00B7,
            //U+0300..U+036F and U+203F..U+2040.
            if IsNameStartChar(ch) then
            begin
                Result := True;
                Exit;
            end;

            //U+00B7 is written as #$00B7 (four hex digits) so that it is always parsed as a
            //WideChar; the two-digit form #$B7 is parsed as an AnsiChar when HIGHCHARUNICODE is
            //OFF and would be converted through the system code page.
            case ch of
            '-', '.', '0'..'9', #$00B7, #$0300..#$036F, #$203F..#$2040: Result := True;
            else
                Result := False;
            end;
        end;

        type
            THtmlEntity = record
                entity: string;
                ch: UCS4Char;
            end;
        const
            //https://www.w3.org/TR/html4/sgml/entities.html#sym
            //html entities are case sensitive (e.g. "larr" is different from "lArr")
            HtmlEntities: array[0..252] of THtmlEntity = (
                (entity: 'apos';        ch: 39; ), // apostrophe (originally only existed in xml, and not in HTML. Was added to HTML5
                (entity: 'quot';        ch: 34; ),  // quotation mark = APL quote, U+0022
                (entity: 'amp';     ch: 38; ),  // ampersand, U+0026
                (entity: 'lt';          ch: 60; ),  // less-than sign, U+003C
                (entity: 'gt';          ch: 62; ),  // greater-than sign, U+003E
                (entity: 'OElig';       ch: 338;    ),  // latin capital ligature OE, U+0152
                (entity: 'oelig';       ch: 339;    ),  // latin small ligature oe, U+0153
                (entity: 'Scaron';  ch: 352;    ),  // latin capital letter S with caron, U+0160
                (entity: 'scaron';  ch: 353;    ),  // latin small letter s with caron, U+0161
                (entity: 'Yuml';        ch: 376;    ),  // latin capital letter Y with diaeresis, U+0178
                (entity: 'circ';        ch: 710;    ),  // modifier letter circumflex accent, U+02C6
                (entity: 'tilde';       ch: 732;    ),  // small tilde, U+02DC
                (entity: 'nbsp';        ch: 160;    ),  // no-break space = non-breaking space,    U+00A0
                (entity: 'iexcl';       ch: 161;    ),  // inverted exclamation mark, U+00A1
                (entity: 'cent';        ch: 162;    ),  // cent sign, U+00A2
                (entity: 'pound';       ch: 163;    ),  // pound sign, U+00A3
                (entity: 'curren';  ch: 164;    ),  // currency sign, U+00A4
                (entity: 'yen';     ch: 165;    ),  // yen sign = yuan sign, U+00A5
                (entity: 'brvbar';  ch: 166;    ),  // broken bar = broken vertical bar,    U+00A6
                (entity: 'sect';        ch: 167;    ),  // section sign, U+00A7
                (entity: 'uml';     ch: 168;    ),  // diaeresis = spacing diaeresis,    U+00A8
                (entity: 'copy';        ch: 169;    ),  // copyright sign, U+00A9
                (entity: 'ordf';        ch: 170;    ),  // feminine ordinal indicator, U+00AA
                (entity: 'laquo';       ch: 171;    ),  // left-pointing double angle quotation mark = left pointing guillemet, U+00AB
                (entity: 'not';     ch: 172;    ),  // not sign, U+00AC
                (entity: 'shy';     ch: 173;    ),  // soft hyphen = discretionary hyphen,    U+00AD
                (entity: 'reg';     ch: 174;    ),  // registered sign = registered trade mark sign,    U+00AE
                (entity: 'macr';        ch: 175;    ),  // macron = spacing macron = overline  = APL overbar, U+00AF
                (entity: 'deg';     ch: 176;    ),  // degree sign, U+00B0
                (entity: 'plusmn';  ch: 177;    ),  // plus-minus sign = plus-or-minus sign,    U+00B1
                (entity: 'sup2';        ch: 178;    ),  // superscript two = superscript digit two  = squared, U+00B2
                (entity: 'sup3';        ch: 179;    ),  // superscript three = superscript digit three  = cubed, U+00B3
                (entity: 'acute';       ch: 180;    ),  // acute accent = spacing acute,    U+00B4
                (entity: 'micro';       ch: 181;    ),  // micro sign, U+00B5
                (entity: 'para';        ch: 182;    ),  // pilcrow sign = paragraph sign,    U+00B6
                (entity: 'middot';  ch: 183;    ),  // middle dot = Georgian comma = Greek middle dot, U+00B7
                (entity: 'cedil';       ch: 184;    ),  // cedilla = spacing cedilla, U+00B8
                (entity: 'sup1';        ch: 185;    ),  // superscript one = superscript digit one,    U+00B9
                (entity: 'ordm';        ch: 186;    ),  // masculine ordinal indicator,    U+00BA
                (entity: 'raquo';       ch: 187;    ),  // right-pointing double angle quotation mark =  right pointing guillemet, U+00BB
                (entity: 'frac14';  ch: 188;    ),  // vulgar fraction one quarter  = fraction one quarter, U+00BC
                (entity: 'frac12';  ch: 189;    ),  // vulgar fraction one half  = fraction one half, U+00BD
                (entity: 'frac34';  ch: 190;    ),  // vulgar fraction three quarters  = fraction three quarters, U+00BE
                (entity: 'iquest';  ch: 191;    ),  // inverted question mark  = turned question mark, U+00BF
                (entity: 'Agrave';  ch: 192;    ),  // latin capital letter A with grave  = latin capital letter A grave,    U+00C0
                (entity: 'Aacute';  ch: 193;    ),  // latin capital letter A with acute,    U+00C1
                (entity: 'Acirc';       ch: 194;    ),  // latin capital letter A with circumflex,    U+00C2
                (entity: 'Atilde';  ch: 195;    ),  // latin capital letter A with tilde,    U+00C3
                (entity: 'Auml';        ch: 196;    ),  // latin capital letter A with diaeresis,    U+00C4
                (entity: 'Aring';       ch: 197;    ),  // latin capital letter A with ring above  = latin capital letter A ring,    U+00C5
                (entity: 'AElig';       ch: 198;    ),  // latin capital letter AE  = latin capital ligature AE,    U+00C6
                (entity: 'Ccedil';  ch: 199;    ),  // latin capital letter C with cedilla,    U+00C7
                (entity: 'Egrave';  ch: 200;    ),  // latin capital letter E with grave,    U+00C8
                (entity: 'Eacute';  ch: 201;    ),  // latin capital letter E with acute,    U+00C9
                (entity: 'Ecirc';       ch: 202;    ),  // latin capital letter E with circumflex,    U+00CA
                (entity: 'Euml';        ch: 203;    ),  // latin capital letter E with diaeresis,    U+00CB
                (entity: 'Igrave';  ch: 204;    ),  // latin capital letter I with grave,    U+00CC
                (entity: 'Iacute';  ch: 205;    ),  // latin capital letter I with acute,    U+00CD
                (entity: 'Icirc';       ch: 206;    ),  // latin capital letter I with circumflex,    U+00CE
                (entity: 'Iuml';        ch: 207;    ),  // latin capital letter I with diaeresis,    U+00CF
                (entity: 'ETH';     ch: 208;    ),  // latin capital letter ETH, U+00D0
                (entity: 'Ntilde';  ch: 209;    ),  // latin capital letter N with tilde,    U+00D1
                (entity: 'Ograve';  ch: 210;    ),  // latin capital letter O with grave,    U+00D2
                (entity: 'Oacute';  ch: 211;    ),  // latin capital letter O with acute,    U+00D3
                (entity: 'Ocirc';       ch: 212;    ),  // latin capital letter O with circumflex,    U+00D4
                (entity: 'Otilde';  ch: 213;    ),  // latin capital letter O with tilde,    U+00D5
                (entity: 'Ouml';        ch: 214;    ),  // latin capital letter O with diaeresis,    U+00D6
                (entity: 'times';       ch: 215;    ),  // multiplication sign, U+00D7
                (entity: 'Oslash';  ch: 216;    ),  // latin capital letter O with stroke  = latin capital letter O slash,    U+00D8
                (entity: 'Ugrave';  ch: 217;    ),  // latin capital letter U with grave,    U+00D9
                (entity: 'Uacute';  ch: 218;    ),  // latin capital letter U with acute,    U+00DA
                (entity: 'Ucirc';       ch: 219;    ),  // latin capital letter U with circumflex,    U+00DB
                (entity: 'Uuml';        ch: 220;    ),  // latin capital letter U with diaeresis,    U+00DC
                (entity: 'Yacute';  ch: 221;    ),  // latin capital letter Y with acute,    U+00DD
                (entity: 'THORN';       ch: 222;    ),  // latin capital letter THORN,    U+00DE
                (entity: 'szlig';       ch: 223;    ),  // latin small letter sharp s = ess-zed,    U+00DF
                (entity: 'agrave';  ch: 224;    ),  // latin small letter a with grave  = latin small letter a grave,    U+00E0
                (entity: 'aacute';  ch: 225;    ),  // latin small letter a with acute,    U+00E1
                (entity: 'acirc';       ch: 226;    ),  // latin small letter a with circumflex,    U+00E2
                (entity: 'atilde';  ch: 227;    ),  // latin small letter a with tilde,    U+00E3
                (entity: 'auml';        ch: 228;    ),  // latin small letter a with diaeresis,    U+00E4
                (entity: 'aring';       ch: 229;    ),  // latin small letter a with ring above  = latin small letter a ring,    U+00E5
                (entity: 'aelig';       ch: 230;    ),  // latin small letter ae  = latin small ligature ae, U+00E6
                (entity: 'ccedil';  ch: 231;    ),  // latin small letter c with cedilla,    U+00E7
                (entity: 'egrave';  ch: 232;    ),  // latin small letter e with grave,    U+00E8
                (entity: 'eacute';  ch: 233;    ),  // latin small letter e with acute,    U+00E9
                (entity: 'ecirc';       ch: 234;    ),  // latin small letter e with circumflex,    U+00EA
                (entity: 'euml';        ch: 235;    ),  // latin small letter e with diaeresis,    U+00EB
                (entity: 'igrave';  ch: 236;    ),  // latin small letter i with grave,    U+00EC
                (entity: 'iacute';  ch: 237;    ),  // latin small letter i with acute,    U+00ED
                (entity: 'icirc';       ch: 238;    ),  // latin small letter i with circumflex,    U+00EE
                (entity: 'iuml';        ch: 239;    ),  // latin small letter i with diaeresis,    U+00EF
                (entity: 'eth';     ch: 240;    ),  // latin small letter eth, U+00F0
                (entity: 'ntilde';  ch: 241;    ),  // latin small letter n with tilde,    U+00F1
                (entity: 'ograve';  ch: 242;    ),  // latin small letter o with grave,    U+00F2
                (entity: 'oacute';  ch: 243;    ),  // latin small letter o with acute,    U+00F3
                (entity: 'ocirc';       ch: 244;    ),  // latin small letter o with circumflex,    U+00F4
                (entity: 'otilde';  ch: 245;    ),  // latin small letter o with tilde,    U+00F5
                (entity: 'ouml';        ch: 246;    ),  // latin small letter o with diaeresis,    U+00F6
                (entity: 'divide';  ch: 247;    ),  // division sign, U+00F7
                (entity: 'oslash';  ch: 248;    ),  // latin small letter o with stroke,    = latin small letter o slash,    U+00F8
                (entity: 'ugrave';  ch: 249;    ),  // latin small letter u with grave,    U+00F9
                (entity: 'uacute';  ch: 250;    ),  // latin small letter u with acute,    U+00FA
                (entity: 'ucirc';       ch: 251;    ),  // latin small letter u with circumflex,    U+00FB
                (entity: 'uuml';        ch: 252;    ),  // latin small letter u with diaeresis,    U+00FC
                (entity: 'yacute';  ch: 253;    ),  // latin small letter y with acute,    U+00FD
                (entity: 'thorn';       ch: 254;    ),  // latin small letter thorn,    U+00FE
                (entity: 'yuml';        ch: 255;    ),  // latin small letter y with diaeresis,    U+00FF
                (entity: 'fnof';        ch: 402;    ),  // latin small f with hook = function  = florin, U+0192
                (entity: 'Alpha';       ch: 913;    ),  // greek capital letter alpha, U+0391
                (entity: 'Beta';        ch: 914;    ),  // greek capital letter beta, U+0392
                (entity: 'Gamma';       ch: 915;    ),  // greek capital letter gamma,    U+0393
                (entity: 'Delta';       ch: 916;    ),  // greek capital letter delta,    U+0394
                (entity: 'Epsilon'; ch: 917;    ),  // greek capital letter epsilon, U+0395
                (entity: 'Zeta';        ch: 918;    ),  // greek capital letter zeta, U+0396
                (entity: 'Eta';     ch: 919;    ),  // greek capital letter eta, U+0397
                (entity: 'Theta';       ch: 920;    ),  // greek capital letter theta,    U+0398
                (entity: 'Iota';        ch: 921;    ),  // greek capital letter iota, U+0399
                (entity: 'Kappa';       ch: 922;    ),  // greek capital letter kappa, U+039A
                (entity: 'Lambda';  ch: 923;    ),  // greek capital letter lambda,    U+039B
                (entity: 'Mu';          ch: 924;    ),  // greek capital letter mu, U+039C
                (entity: 'Nu';          ch: 925;    ),  // greek capital letter nu, U+039D
                (entity: 'Xi';          ch: 926;    ),  // greek capital letter xi, U+039E
                (entity: 'Omicron'; ch: 927;    ),  // greek capital letter omicron, U+039F
                (entity: 'Pi';          ch: 928;    ),  // greek capital letter pi, U+03A0
                (entity: 'Rho';     ch: 929;    ),  // greek capital letter rho, U+03A1
                // there is no Sigmaf, and no U+03A2 character either
                (entity: 'Sigma';       ch: 931;    ),  // greek capital letter sigma,    U+03A3
                (entity: 'Tau';     ch: 932;    ),  // greek capital letter tau, U+03A4
                (entity: 'Upsilon'; ch: 933;    ),  // greek capital letter upsilon,    U+03A5
                (entity: 'Phi';     ch: 934;    ),  // greek capital letter phi,    U+03A6
                (entity: 'Chi';     ch: 935;    ),  // greek capital letter chi, U+03A7
                (entity: 'Psi';     ch: 936;    ),  // greek capital letter psi,    U+03A8
                (entity: 'Omega';       ch: 937;    ),  // greek capital letter omega,    U+03A9
                (entity: 'alpha';       ch: 945;    ),  // greek small letter alpha,    U+03B1
                (entity: 'beta';        ch: 946;    ),  // greek small letter beta, U+03B2
                (entity: 'gamma';       ch: 947;    ),  // greek small letter gamma,    U+03B3
                (entity: 'delta';       ch: 948;    ),  // greek small letter delta,    U+03B4
                (entity: 'epsilon'; ch: 949;    ),  // greek small letter epsilon,    U+03B5
                (entity: 'zeta';        ch: 950;    ),  // greek small letter zeta, U+03B6
                (entity: 'eta';     ch: 951;    ),  // greek small letter eta, U+03B7
                (entity: 'theta';       ch: 952;    ),  // greek small letter theta,    U+03B8
                (entity: 'iota';        ch: 953;    ),  // greek small letter iota, U+03B9
                (entity: 'kappa';       ch: 954;    ),  // greek small letter kappa,    U+03BA
                (entity: 'lambda';  ch: 955;    ),  // greek small letter lambda,    U+03BB
                (entity: 'mu';          ch: 956;    ),  // greek small letter mu, U+03BC
                (entity: 'nu';          ch: 957;    ),  // greek small letter nu, U+03BD
                (entity: 'xi';          ch: 958;    ),  // greek small letter xi, U+03BE
                (entity: 'omicron'; ch: 959;    ),  // greek small letter omicron, U+03BF NEW
                (entity: 'pi';          ch: 960;    ),  // greek small letter pi, U+03C0
                (entity: 'rho';     ch: 961;    ),  // greek small letter rho, U+03C1


小智 5

这是我的 HTMLDecode 函数(在 CodeGear 的 HTTPApp 单元基础上略作修改):

function HTMLDecode(const AStr: String): String;
{
  Decodes the HTML character references found in AStr and returns the result.

  Recognised references:
    - named:   &amp;  &lt;  &gt;  &quot;  &apos;  &nbsp; (decoded to a plain space)
    - numeric: &#NNN; (decimal) and &#xHHH; / &#XHHH; (hexadecimal)

  Anything else that starts with '&' (an unknown entity, a reference without the
  closing ';', a number that cannot be parsed or represented) is copied to the
  output unchanged, so the function never loses or invents characters.

  The output can never be longer than the input, which is why a buffer of
  Length(AStr) characters is allocated up front and trimmed at the end.
}

  // True when the text at P begins with Token. StrLComp stops at the first #0,
  // so this never reads past the end of the source string.
  function StartsWith(P: PChar; const Token: String): Boolean;
  begin
    Result := StrLComp(P, PChar(Token), Length(Token)) = 0;
  end;

const
  EntityNames: array[0..5] of String = ('&amp;', '&lt;', '&gt;', '&quot;', '&nbsp;', '&apos;'); { do not localize }
  EntityChars: array[0..5] of Char = ('&', '<', '>', '"', ' ', '''');
var
  Sp, Rp, RStart, Ep: PChar;
  S: String;
  I, Code, K: Integer;
  Decoded: Boolean;
begin
  Result := '';
  if AStr = '' then
    Exit;

  SetLength(Result, Length(AStr));
  RStart := PChar(Result);
  Rp := RStart;
  Sp := PChar(AStr);

  while Sp^ <> #0 do
  begin
    Decoded := False;

    if Sp^ = '&' then
    begin
      // Named entities
      for K := Low(EntityNames) to High(EntityNames) do
        if StartsWith(Sp, EntityNames[K]) then
        begin
          Rp^ := EntityChars[K];
          Inc(Sp, Length(EntityNames[K]) - 1); // leave Sp on the ';'
          Decoded := True;
          Break;
        end;

      // Numeric references
      if (not Decoded) and (Sp[1] = '#') then
      begin
        Ep := Sp + 2;
        while (Ep^ <> ';') and (Ep^ <> #0) do
          Inc(Ep);

        // Require a terminating ';' and at least one character between '#' and ';'
        if (Ep^ = ';') and (Ep > Sp + 2) then
        begin
          SetString(S, Sp + 2, Ep - (Sp + 2));
          // Rewrite the HTML hex marker into the '$' prefix that Val understands
          if (S[1] = 'x') or (S[1] = 'X') then
            S := '$' + Copy(S, 2, MaxInt);
          Val(S, I, Code);

          if (Code = 0) and (I >= 0) then
          begin
            if I <= Ord(High(Char)) then
            begin
              Rp^ := Char(I);
              Sp := Ep;
              Decoded := True;
            end
            else if (SizeOf(Char) = 2) and (I <= $10FFFF) then
            begin
              // Beyond the BMP: emit a UTF-16 surrogate pair. The reference is at
              // least 8 characters long, so two output characters always fit.
              Dec(I, $10000);
              Rp^ := Char($D800 + (I shr 10));
              Inc(Rp);
              Rp^ := Char($DC00 + (I and $3FF));
              Sp := Ep;
              Decoded := True;
            end;
          end;
        end;
      end;
    end;

    // Ordinary character, or an '&' that does not start a recognised reference
    if not Decoded then
      Rp^ := Sp^;

    Inc(Rp);
    Inc(Sp);
  end;

  SetLength(Result, Rp - RStart);
end;
Run Code Online (Sandbox Code Playgroud)

  • 你修改了什么,为什么? (4认同)