注册 登录
编程论坛 C语言论坛

求助 正则表达式获取网页编码(gb2312)失败

追梦人zmrghy 发布于 2023-03-04 03:03, 701 次点击
只有本站会员才能查看附件,请 登录


网页中的汉字显示为乱码,想在代码中从网页内容里获取编码名称(gb2312)。


只有本站会员才能查看附件,请 登录


RegexBuddy 4 测试正则表达式 ".*?charset[=](.*?)[\"]>.*" ,显示正确。。。




只有本站会员才能查看附件,请 登录


实际应用时,获取 "gb2312" 失败,是什么原因?


程序代码:
#pragma once


namespace GetWebpageCode {

    using namespace System;
    using namespace System::ComponentModel;
    using namespace System::Collections;
    using namespace System::Windows::Forms;
    using namespace System::Data;
    using namespace System::Drawing;
    using namespace System::Net;
    using namespace System::Text;
    using namespace System::Text::RegularExpressions;

    /// <summary>
    /// Form1: a form holding a single-line text box, six buttons and a multiline text box.
    /// </summary>
    public ref class Form1 : public System::Windows::Forms::Form
    {
    public:
        Form1(void)
        {
            InitializeComponent();
            //
            
//TODO:  在此处添加构造函数代码
            
//
        }

    protected:
        /// <summary>
        
/// 清理所有正在使用的资源。
        
/// </summary>
        ~Form1()
        {
            if (components)
            {
                delete components;
            }
        }
    // Controls instantiated in InitializeComponent().
    private: System::Windows::Forms::Button^ button1;
    protected:
    private: System::Windows::Forms::Button^ button2;
    private: System::Windows::Forms::Button^ button3;
    private: System::Windows::Forms::Button^ button4;
    private: System::Windows::Forms::Button^ button5;
    private: System::Windows::Forms::Button^ button6;
    // Single-line text box; button1_Click reads the address to download from it.
    private: System::Windows::Forms::TextBox^ textBox1;
    // Multiline text box; button1_Click writes the decoded page text into it.
    private: System::Windows::Forms::TextBox^ textBox2;

    private:
        /// <summary>
        /// Required designer variable.
        /// </summary>
        System::ComponentModel::Container ^components;

#pragma region Windows Form Designer generated code
        /// <summary>
        /// Required method for Designer support - do not modify
        /// the contents of this method with the code editor.
        /// Creates the six buttons and two text boxes, sets their properties
        /// and adds them to the form.
        /// </summary>
        void InitializeComponent(void)
        {
            this->button1 = (gcnew System::Windows::Forms::Button());
            this->button2 = (gcnew System::Windows::Forms::Button());
            this->button3 = (gcnew System::Windows::Forms::Button());
            this->button4 = (gcnew System::Windows::Forms::Button());
            this->button5 = (gcnew System::Windows::Forms::Button());
            this->button6 = (gcnew System::Windows::Forms::Button());
            this->textBox1 = (gcnew System::Windows::Forms::TextBox());
            this->textBox2 = (gcnew System::Windows::Forms::TextBox());
            this->SuspendLayout();
            //
            // button1
            //
            this->button1->Font = (gcnew System::Drawing::Font(L"宋体", 12));
            this->button1->Location = System::Drawing::Point(33, 53);
            this->button1->Name = L"button1";
            this->button1->Size = System::Drawing::Size(100, 35);
            this->button1->TabIndex = 0;
            this->button1->Text = L"button1";
            this->button1->UseVisualStyleBackColor = true;
            // button1 is the only button in this method that gets a Click handler.
            this->button1->Click += gcnew System::EventHandler(this, &Form1::button1_Click);
            //
            // button2
            //
            this->button2->Font = (gcnew System::Drawing::Font(L"宋体", 12));
            this->button2->Location = System::Drawing::Point(196, 53);
            this->button2->Name = L"button2";
            this->button2->Size = System::Drawing::Size(100, 35);
            this->button2->TabIndex = 1;
            this->button2->Text = L"button2";
            this->button2->UseVisualStyleBackColor = true;
            //
            // button3
            //
            this->button3->Font = (gcnew System::Drawing::Font(L"宋体", 12));
            this->button3->Location = System::Drawing::Point(359, 53);
            this->button3->Name = L"button3";
            this->button3->Size = System::Drawing::Size(100, 35);
            this->button3->TabIndex = 2;
            this->button3->Text = L"button3";
            this->button3->UseVisualStyleBackColor = true;
            //
            // button4
            //
            this->button4->Font = (gcnew System::Drawing::Font(L"宋体", 12));
            this->button4->Location = System::Drawing::Point(522, 53);
            this->button4->Name = L"button4";
            this->button4->Size = System::Drawing::Size(100, 35);
            this->button4->TabIndex = 3;
            this->button4->Text = L"button4";
            this->button4->UseVisualStyleBackColor = true;
            //
            // button5
            //
            this->button5->Font = (gcnew System::Drawing::Font(L"宋体", 12));
            this->button5->Location = System::Drawing::Point(685, 53);
            this->button5->Name = L"button5";
            this->button5->Size = System::Drawing::Size(100, 35);
            this->button5->TabIndex = 4;
            this->button5->Text = L"button5";
            this->button5->UseVisualStyleBackColor = true;
            //
            // button6
            //
            this->button6->Font = (gcnew System::Drawing::Font(L"宋体", 12));
            this->button6->Location = System::Drawing::Point(848, 53);
            this->button6->Name = L"button6";
            this->button6->Size = System::Drawing::Size(100, 35);
            this->button6->TabIndex = 5;
            this->button6->Text = L"button6";
            this->button6->UseVisualStyleBackColor = true;
            //
            // textBox1
            //
            this->textBox1->Font = (gcnew System::Drawing::Font(L"宋体", 12));
            this->textBox1->Location = System::Drawing::Point(32, 12);
            this->textBox1->Name = L"textBox1";
            this->textBox1->Size = System::Drawing::Size(915, 26);
            this->textBox1->TabIndex = 6;
            //
            // textBox2
            //
            this->textBox2->Location = System::Drawing::Point(0, 104);
            this->textBox2->Multiline = true;
            this->textBox2->Name = L"textBox2";
            this->textBox2->ScrollBars = System::Windows::Forms::ScrollBars::Vertical;
            this->textBox2->Size = System::Drawing::Size(1007, 536);
            this->textBox2->TabIndex = 7;
            //
            // Form1
            //
            this->AutoScaleDimensions = System::Drawing::SizeF(6, 12);
            this->AutoScaleMode = System::Windows::Forms::AutoScaleMode::Font;
            this->ClientSize = System::Drawing::Size(1008, 641);
            this->Controls->Add(this->textBox2);
            this->Controls->Add(this->textBox1);
            this->Controls->Add(this->button6);
            this->Controls->Add(this->button5);
            this->Controls->Add(this->button4);
            this->Controls->Add(this->button3);
            this->Controls->Add(this->button2);
            this->Controls->Add(this->button1);
            this->Name = L"Form1";
            this->Text = L"Form1";
            this->ResumeLayout(false);
            this->PerformLayout();

        }
#pragma endregion
    private:String^ WebPageCode;
    private:System::Void button1_Click(System::Object^ sender, System::EventArgs^ e)
    {
        WebClient^ myWebClient = gcnew WebClient;

        array<Byte>^ mybuffer = myWebClient->DownloadData(textBox1->Text);
        //WebPageCode = Encoding::GetEncoding(936)->GetString(mybuffer);
        
//UTF8Encoding temp;
        ASCIIEncoding temp;
        WebPageCode=temp.GetString(mybuffer);
        Regex^ charcode=gcnew Regex(".*?charset[=](.*?)[\"] > .*");
        String^ StrCode = charcode->Replace(WebPageCode, "$1");
        WebPageCode = Encoding::GetEncoding(StrCode)->GetString(mybuffer);
        textBox2->Text = WebPageCode;
    }
};
}
2 回复
#2
阳光上的桥2023-03-09 17:27
只有本站会员才能查看附件,请 登录
#3
追梦人zmrghy2023-03-11 00:22
回复 2楼 阳光上的桥
表达式确实写错了,但这不是主要问题。
// Download the raw bytes of the page whose address is in textBox1.
WebClient^ myWebClient = gcnew WebClient;
    array<Byte>^ mybuffer = myWebClient->DownloadData(textBox1->Text);
    // Decode the bytes as ASCII; this text is what the regular expression is run against.
    ASCIIEncoding temp;
    String^ WebPageCode = temp.GetString(mybuffer);

WebPageCode 里面有很多行字符串,正则匹配的内容是从 WebPageCode 开头到第一个“\r\n”(回车换行)就结束了,不再匹配后面的内容。
弄了几天终于想到办法了:先按行拆分,再逐行匹配。
程序代码:
/// <summary>
/// Downloads the page whose address is in textBox1 and shows it in textBox2,
/// decoded with the charset reported by GetCharacterEncoding. Falls back to
/// UTF-8 when no usable charset name is reported.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private:System::Void button1_Click(System::Object^ sender, System::EventArgs^ e)
{
    // Nothing to download when no address has been entered.
    if (String::IsNullOrEmpty(textBox1->Text))
    {
        return;
    }

    // Stack semantics: the WebClient is disposed when it goes out of scope.
    WebClient myWebClient;
    array<Byte>^ mybuffer = myWebClient.DownloadData(textBox1->Text);

    // An ASCII decode is sufficient for locating the (ASCII-only) charset name.
    String^ asciiText = Encoding::ASCII->GetString(mybuffer);
    String^ StrCode = GetCharacterEncoding(asciiText);

    // GetCharacterEncoding may report nothing (nullptr / empty) or a name .NET
    // does not know; Encoding::GetEncoding would throw in both cases.
    Encoding^ pageEncoding = Encoding::UTF8;
    if (!String::IsNullOrEmpty(StrCode))
    {
        try
        {
            pageEncoding = Encoding::GetEncoding(StrCode->Trim());
        }
        catch (ArgumentException^)
        {
            // Unknown charset name: keep the UTF-8 fallback.
        }
    }

    textBox2->Text = pageEncoding->GetString(mybuffer);
}
/// <summary>
/// Extracts the charset name declared in an HTML document, e.g. "gb2312" from
/// content="text/html; charset=gb2312" or "utf-8" from &lt;meta charset="utf-8"&gt;.
/// </summary>
/// <param name="Str">Page text to search; may be nullptr or empty.</param>
/// <returns>The token that follows "charset=", or nullptr when none is found.</returns>
private: String^ GetCharacterEncoding(String^ Str)
{
    if (String::IsNullOrEmpty(Str))
    {
        return nullptr;
    }

    // Search the whole text once and read the capture group directly. This avoids
    // splitting on "\r\n" (which misses LF-only pages), and skipping an optional
    // opening quote keeps the capture from being empty for <meta charset="...">.
    Match^ found = Regex::Match(Str,
        "charset\\s*=\\s*[\"']?\\s*([\\w.:\\-]+)", RegexOptions::IgnoreCase);
    if (found->Success)
    {
        return found->Groups[1]->Value;
    }
    return nullptr;
}

只有本站会员才能查看附件,请 登录


[此贴子已经被作者于2023-3-11 00:24编辑过]

1