Delphi下获取网络数据乱码问题

根据网页所使用的字符集,按对应的代码页对下载到的数据进行解码,从而正确获取网页内容、避免乱码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
// Downloads the document at AUrl through WinInet and decodes the received
// bytes into a string using the code page that corresponds to ACharset.
//
// @param AUrl     : URL of the page to fetch
// @param ACharset : charset name of the page, compared case-insensitively
//                   (e.g. 'UTF-8', 'GB2312'); empty or unrecognised names are
//                   decoded as UTF-8
// @param AState   : set to True on success, False on any failure
//
// @returns the decoded page content on success; on failure a string of the
//          form '[Error]<url>|<exception message>'
function GetHtmlContent(const AUrl, ACharset: string; var AState: Boolean): string;
const
  cChunkSize = 8192; // bytes requested per InternetReadFile call
var
  vStrStream: TStringStream; // accumulates the raw bytes; decodes on DataString
  vNetHandle, vURLHandle: HINTERNET;
  vBuffer: array[0..cChunkSize - 1] of Byte;
  vReadSize: Cardinal;
  vCharsetInt: Integer;
  vCharset: string;
begin
  AState := False;
  try
    // Map the charset label to a Windows code page identifier. Each label is
    // compared as a whole so that e.g. 'UTF8' can never be mistaken for part
    // of another entry.
    vCharset := UpperCase(Trim(ACharset));
    if (vCharset = 'UTF-8') or (vCharset = 'UTF8') then
      vCharsetInt := 65001
    else if (vCharset = 'UTF-7') or (vCharset = 'UTF7') then
      vCharsetInt := 65000
    else if (vCharset = 'GB2312') or (vCharset = 'GB-2312') or (vCharset = 'GBK') then
      // 936 is the Windows code page named "gb2312"; it is a superset (GBK),
      // so pages labelled GB2312 that contain GBK characters still decode.
      vCharsetInt := 936
    else if vCharset = 'GB18030' then
      vCharsetInt := 54936
    else if vCharset = 'BIG5' then
      vCharsetInt := 950
    else if vCharset = 'EUC-KR' then
      vCharsetInt := 949 // Unified Hangul Code, superset of EUC-KR
    else if vCharset = 'EUC-JP' then
      vCharsetInt := 20932
    else if (vCharset = 'SHIFT_JIS') or (vCharset = 'SHIFT-JIS') then
      vCharsetInt := 932
    else if (vCharset = 'ASCII') or (vCharset = 'US-ASCII') then
      vCharsetInt := 20127
    else if vCharset = 'ISO-8859-1' then
      vCharsetInt := 28591
    else if vCharset = 'KOI8-R' then
      vCharsetInt := 20866
    else if vCharset = 'KOI8-U' then
      vCharsetInt := 21866
    else
      vCharsetInt := 65001; // default: UTF-8

    vStrStream := TStringStream.Create('', vCharsetInt);
    try
      vNetHandle := InternetOpen('Delphi', INTERNET_OPEN_TYPE_PRECONFIG, nil, nil, 0);
      if vNetHandle = nil then
        RaiseLastOSError;
      try
        vURLHandle := InternetOpenUrl(vNetHandle, PChar(AUrl), nil, 0, INTERNET_FLAG_RELOAD, 0);
        if vURLHandle = nil then
          RaiseLastOSError;
        try
          repeat
            vReadSize := 0;
            // A failed read must abort; otherwise a stale vReadSize could keep
            // the loop running or silently truncate the page.
            if not InternetReadFile(vURLHandle, @vBuffer[0], cChunkSize, vReadSize) then
              RaiseLastOSError;
            if vReadSize <> 0 then
              vStrStream.WriteBuffer(vBuffer[0], vReadSize);
          until vReadSize = 0; // a successful read of 0 bytes marks end of data
        finally
          InternetCloseHandle(vURLHandle);
        end;
      finally
        InternetCloseHandle(vNetHandle);
      end;
      // Decode only after all bytes are collected, so multi-byte characters
      // that straddle two chunks are not broken.
      Result := vStrStream.DataString;
      AState := True;
    finally
      vStrStream.Free;
    end;
  except
    on e: Exception do
    begin
      AState := False;
      Result := '[Error]'+AUrl + '|' + e.Message;
    end;
  end;
end;
以下代码摘自 System.SysUtils.pas(各代码页对应的 MaxCharSize):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
...
// Looks up CodePage in the table below. For a listed code page it stores the
// associated value in lpCPInfo.MaxCharSize and returns True; for any other
// code page it returns False and does not modify lpCPInfo.
// NOTE(review): the iconv_open() references suggest this is the non-Windows
// implementation in the RTL, and MaxCharSize is presumably the maximum number
// of bytes per character for the code page — confirm against the full unit.
function GetCPInfo(CodePage: Cardinal; var lpCPInfo: TCPInfo): Boolean;
begin
// Assume success; the else branch of the case resets this for unknown pages.
Result := True;
case CodePage of
// Code page identifiers understood directly by iconv_open()
154, 367, 437, 737, 775, 819, 850, 852,
853, 855..858, 860..866, 869, 874, 922: lpCPInfo.MaxCharSize := 1;
932, 936, 943, 949, 950: lpCPInfo.MaxCharSize := 2;
1046, 1124, 1125, 1129, 1133, 1161, 1162, 1163, 1250..1258: lpCPInfo.MaxCharSize := 1;
1361: lpCPInfo.MaxCharSize := 2;
// Code page identifiers translated to iconv_open() encoding names (by LocaleNameFromCodePage)
10000, 10004..10007, 10010, 10017, 10021,
10029, 10079, 10081, 10082: lpCPInfo.MaxCharSize := 1; // MacRoman .. MacCroatian
12000, 12001: lpCPInfo.MaxCharSize := 4; // UTF-32LE, UTF-32BE
20127, 20866: lpCPInfo.MaxCharSize := 1; // ASCII, KOI8-R
20932: lpCPInfo.MaxCharSize := 3; // EUC-JP
20936: lpCPInfo.MaxCharSize := 2; // GB2312, EUC-KR
21866, 28591..28601, 28603..28606: lpCPInfo.MaxCharSize := 1; // KOI8-U, ISO-8859-1..ISO-8859-16
50221: lpCPInfo.MaxCharSize := 9; // ISO-2022-JP
50225: lpCPInfo.MaxCharSize := 7; // ISO-2022-KR
50227: lpCPInfo.MaxCharSize := 8; // ISO-2022-CN
51932: lpCPInfo.MaxCharSize := 3; // EUC-JP
51936, 51949: lpCPInfo.MaxCharSize := 2; // GB2312, EUC-KR
51950, 52936, 54936: lpCPInfo.MaxCharSize := 4; // EUC-TW, HZ-GB-2312, GB18030
65000: lpCPInfo.MaxCharSize := 6; // UTF-7
65001: lpCPInfo.MaxCharSize := 4; // UTF-8
else
// Code page not in the table: report failure.
Result := False;
end;
end;
...
如果觉得我的文章对您有用,请随意打赏。您的支持将是我继续创作的动力!