Implementing Automatic Web Information Capture in C#

xiaoxiao2021-03-06 42

NetBug - crawler V1.02 development notes

background

With the popularity of the Internet, the amount of information on the network is growing extremely fast. Finding the information you need among so much data is very tedious, and retrieving it once you have found it is troublesome as well. This calls for an Internet information capture program to replace manual work.

An Internet information capture program is a program that collects the relevant information according to the user's keywords or key websites, and delivers it in the format the user wants.

The growth in the amount of information also sharply increases the workload of the staff who publish content on information websites.

To publish information dynamically, reduce staff workload, and track the latest information in real time, an automatic information provider is required; this is why Internet information capture programs came into being.

aims

Implement user-defined classification of website information, store the captured information in a local database, generate static pages or other user-defined information structures, and download the multimedia files related to the information.

Develop

- Target site structure analysis

This step is the key to capturing information accurately.

First, choose a frequently updated page as the capture address, and then analyze the characteristics of the content-page URLs.

Then analyze the element characteristics of the content pages to be captured, such as the position of the title and of the content, to obtain the positioning marks.

Write the above information into your own configuration file or save it to the database.

Each website needs to be analyzed separately, and a separate configuration file written for the capture program to use.

- Information extraction

Get the page URL from the configuration file, then use the HttpWebRequest class to fetch the content:

/// <summary>
/// Fetches the page at <paramref name="a_strurl"/> over HTTP and returns its body as text.
/// </summary>
/// <param name="a_strurl">Address of the page to download.</param>
/// <param name="timeout">Value assigned to HttpWebRequest.Timeout (milliseconds).</param>
/// <returns>
/// The page text, each line terminated with "\r\n". On any failure the method does
/// not throw; it returns the string "Error:" followed by the exception message.
/// </returns>
public string get_http(string a_strurl, int timeout)
{
    string strResult;
    try
    {
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strurl);
        myReq.Timeout = timeout;

        // Dispose response, stream and reader deterministically; leaving them open
        // keeps the underlying connection busy and eventually stalls later requests.
        using (HttpWebResponse httpWResp = (HttpWebResponse)myReq.GetResponse())
        using (Stream myStream = httpWResp.GetResponseStream())
        // Encoding.Default is kept from the original.
        // NOTE(review): pages served in another encoding will be mis-decoded.
        using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
        {
            StringBuilder strBuilder = new StringBuilder();
            string line;
            // ReadLine() returns null only at end of stream. The original loop tested
            // Peek() != -1, which can report -1 early on a non-seekable network stream
            // and silently truncate the page.
            while ((line = sr.ReadLine()) != null)
            {
                strBuilder.Append(line).Append("\r\n");
            }
            strResult = strBuilder.ToString();
        }
    }
    catch (Exception exp)
    {
        // Callers receive the failure as text rather than as an exception.
        strResult = "Error:" + exp.Message;
    }
    return strResult;
}

After obtaining the page content, analyze the link addresses in the page to extract the URLs to be captured:

/// <summary>
/// Downloads the page at <paramref name="Urlstr"/>, isolates the fragment between
/// <paramref name="Blockb"/> and <paramref name="Blocke"/>, and collects the title
/// and absolute URL of every anchor found in that fragment.
/// </summary>
/// <param name="Urlstr">Address of the list page to scan.</param>
/// <param name="Blockb">Marker text that precedes the region of interest.</param>
/// <param name="Blocke">Marker text that follows the region of interest.</param>
/// <returns>
/// The concatenation, for each link, of the cleaned title followed by the URL.
/// An empty string is returned when the URL has no "." or "/", when the markers
/// are not found, or when any exception occurs while parsing.
/// </returns>
/// <remarks>
/// NOTE(review): this body is a reconstruction. The published listing lost its "+"
/// operators, its "return" keywords, several closing braces and every string literal
/// that looked like an HTML tag. Literals marked NOTE(review) below are inferred and
/// must be confirmed against the original project. The names publicfun, get_http,
/// Removehtmlcode and time1 are used exactly as they appear in the listing.
/// </remarks>
public string SniffWeburl(string Urlstr, string Blockb, string Blocke)
{
    string siteRoot;   // scheme + host, used for links that start with "/"
    string pageDir;    // directory of the scanned page, used for plain relative links
    string result = "";

    try
    {
        int firstDot = Urlstr.IndexOf(".");
        int lastSlash = Urlstr.LastIndexOf("/");
        if (firstDot < 0 || lastSlash < 0)
        {
            return "";
        }

        // First "/" after the first "." marks the end of the host part.
        int pathSlash = Urlstr.IndexOf("/", firstDot);
        if (pathSlash < 0)
        {
            siteRoot = Urlstr;
            pageDir = Urlstr;
        }
        else
        {
            siteRoot = Urlstr.Substring(0, pathSlash);
            pageDir = Urlstr.Substring(0, lastSlash);
        }

        string page = new publicfun().get_http(Urlstr, time1);

        // Cut out the region between the two markers.
        string block = "";
        int pos1 = page.IndexOf(Blockb);
        int pos2 = page.IndexOf(Blocke, pos1 + Blockb.Length);
        if (pos1 > 0 && pos2 > 0 && pos2 > pos1)
        {
            block = page.Substring(pos1 + Blockb.Length, pos2 - pos1 - Blockb.Length);
            block = block.Substring(block.IndexOf("<"));

            // Normalise tag/attribute casing and quoting so the scan below only has
            // to look for one spelling. String.Replace does the same job as the
            // original repeated IndexOf/Substring loops in a single pass.
            // NOTE(review): "<A", "</A", "HREF=" and "Href=" are inferred; the listing
            // shows these literals stripped or lower-cased.
            block = block.Replace("<A", "<a")
                         .Replace("</A", "</a")
                         .Replace("HREF=", "href=")
                         .Replace("Href=", "href=")
                         .Replace("href='", "href=\"");
        }

        string rest = block;
        int hrefAt = rest.IndexOf("href=");
        while (hrefAt >= 0)
        {
            // Skip an opening double quote, if present, to find the first URL character.
            int urlStart;
            string firstChar = rest.Substring(hrefAt + 5, 1);
            if (firstChar == "\"")
            {
                urlStart = hrefAt + 6;
                if (urlStart + 1 > rest.Length)
                {
                    return "";
                }
                firstChar = rest.Substring(urlStart, 1);
            }
            else
            {
                urlStart = hrefAt + 5;
            }

            // In-page anchors ("#...") are not content links; move past them.
            if (firstChar == "#")
            {
                rest = rest.Substring(urlStart);
                hrefAt = rest.IndexOf("href=");
                continue;
            }

            // NOTE(review): the listing shows IndexOf("") here; a single space is assumed.
            int spaceAt = rest.IndexOf(" ", urlStart);
            int tagEnd = rest.IndexOf(">", urlStart);
            // NOTE(review): literal and start index were stripped from the listing;
            // "</a" searched from the URL start is assumed.
            int closeAt = rest.IndexOf("</a", urlStart);
            if (closeAt < 0 || tagEnd < 0)
            {
                break; // no complete anchor left
            }

            string title = rest.Substring(tagEnd + 1, closeAt - tagEnd - 1);

            // The URL ends at the first space inside the tag, or at ">" if there is none.
            int urlEnd = (spaceAt < 0 || spaceAt > tagEnd) ? tagEnd : spaceAt;
            string lastChar = rest.Substring(urlEnd - 1, 1);
            if (lastChar == "\"" || lastChar == "'")
            {
                urlEnd = urlEnd - 1; // drop the closing quote
            }

            string link = rest.Substring(urlStart, urlEnd - urlStart);

            // Resolve relative links. "https://" is checked as well so that secure
            // absolute links are not mistaken for relative paths.
            if (link.IndexOf("http://") < 0 && link.IndexOf("https://") < 0)
            {
                if (link.IndexOf("/") == 0)
                {
                    link = siteRoot + link;
                }
                else
                {
                    int levelsUp = 0;
                    string baseUrl = Urlstr;
                    while (link.IndexOf("../") >= 0)
                    {
                        link = link.Substring(link.IndexOf("../") + 3);
                        levelsUp = levelsUp + 1;
                    }
                    bool hadParentRefs = levelsUp > 0;

                    // Strip the file name plus one directory per "../" from the page URL.
                    while (baseUrl.LastIndexOf("/") >= 0 && levelsUp >= 0)
                    {
                        baseUrl = baseUrl.Substring(0, baseUrl.LastIndexOf("/"));
                        levelsUp = levelsUp - 1;
                    }

                    link = (hadParentRefs ? baseUrl : pageDir) + "/" + link;
                }
            }

            // NOTE(review): the listing shows title and URL joined with nothing between
            // them and the statement is truncated; separator literals were probably
            // lost in extraction. Confirm the expected output format with the caller.
            result = result + new publicfun().Removehtmlcode(title) + link;

            // Continue after the closing "</a>" (4 characters).
            rest = rest.Substring(closeAt + 4);
            hrefAt = rest.IndexOf("href=");
        }

        return result;
    }
    catch (Exception)
    {
        // Any parsing failure (for example an out-of-range Substring on malformed
        // markup) is reported to the caller as "no links found".
        return "";
    }
}

After getting the URLs whose content is to be captured, process each page:

/ / Get link content and classify

Public String GetWebContent (String Gatherurl, String Suburl, String Subtitle, String B_Content, String E_CONTENT, STRING B_FILTER, STRING E_FILTER, STRING ROOT)

{

String tmpallstr = ""

String dfstrb = "";

String dfstre = "";

String repicstr = ""; // picture return path

String recontentstr = "";

String Pichtml = "Images"; // local image path

String urlch1 = ""; string urlch2 = "";

INT POS1 = GATHERURL.INDEXOF (".");

INT POS2 = GATHERURL.lastIndexof ("/");

IF (POS1 <0)

{

""; "

}

IF (POS2 <0)

{

""; "

}

INT POS3 = GATHERURL.INDEXOF ("/", POS1);

IF (POS3 <0)

{

URLCH1 = GATHERURL;

URLCH2 = GATHERURL;

}

Else

{

URLCH1 = Gatherurl.Substring (0, POS3);

URLCH2 = Gatherurl.Substring (0, POS2);

}

TMPallstr = new publicfun (). get_http (Suburl, Time1);

// release source

String Docromstr = ""

IF (dfstrb! = "" && dfstre! = ")

{

IF (tmpallstr! = "")

{

INT B_DOCF = TMPallStr.Indexof (DFSTRB);

IF (b_docf> 0)

{

INT E_DOCF = TmPallStr.indexof (DFSTRE, B_DOCF DFSTRB.LENGTH);

IF (e_docf> 0 && e_docf> b_docf && e_docf - b_docf <20)

{

Docfromstr = tmpallstr.substring (b_docf dfstrb.length, e_docf - b_docf - dfstrb.length);

}

// Take content

IF (tmpallstr! = "")

{

Int begin_strnum = tmpallstr.indexof (b_content);

IF (Begin_Strnum <0)

{

""; "

}

INT end_strnum = tmpallstr.indexof (e_content, begin_strnum b_content.length);

IF (End_Strnum <0)

{

""; "

}

String stotalsubm = ""

IF (End_Strnum> Begin_Strnum)

{

STOTALSUBM = TMPallstr.substring (begin_strnum, end_strnum - begin_strnum);

}

IF (STOTALSUBM == "")

{

""; "

}

// Filter useless information

Int bfnum = numberSubm.indexof (b_filter);

IF (bfnum> -1)

{

INT EFNUM = STOTALSUBM.INDEXOF (E_FILTER, BFNUM);

IF (EFNUM> -1)

{

IF (EFNUM> BFNUM) {

STOTALSUBM = STOTALSUBM.SUBSTRING (0, BFNUM) STOTALSUBM.SUBSTRING (EFNUM E_FILTER.LENGTH);

}

/ / Format the picture mark

WHILE (STOTALSUBM.INDEXOF ("src =")> = 0)

{

STOTALSUBM = STOTALSUBM.SUBSTRING (0, SRC = ")) " SRC = " STOTALSUBM.SUBSTRING (SRC =" 4);

}

WHILE (STOTALSUBM.INDEXOF ("src =")> = 0)

{

STOTALSUBM = STOTALSUBM.SUBSTRING (0, SRC = ")) " SRC = " STOTALSUBM.SUBSTRING (SRC =" 4);

}

While (STOTALSUBM.INDEXOF ("src = '")> = 0)

{

STOTALSUBM = STOTALSUBM.SUBSTRING (0, SRC = ')) "SRC = /" " STOTALSUBM.SUBSTRING (SRC =') 5);

}

// Take a picture address

INT END_N12 = 0;

INT END_NUMS2 = 0;

INT begin_nums2 = numberSubm.indexof ("src =");

While (begin_nums2> = 0)

{

String tmpstr = numberSubm.substring (begin_nums2 4, 1);

IF (tmpstr == "/")

{

END_N12 = Begin_NUMS2 5;

}

Else

{

END_N12 = Begin_NUMS2 4;

}

Int end_nums2a = numberSubm.indexof ("", end_n12);

Int end_nums2b = numberSubm.indexof (">", end_n12;

IF (end_nums2b <0)

{

Break;

}

IF (End_nums2a> End_nums2b)

{

END_NUMS2 = END_NUMS2B;

}

Else

{

IF (End_nums2a <0)

{

END_NUMS2 = END_NUMS2B;

}

Else

{

END_NUMS2 = End_nums2a;

}

Tmpstr = STOTALSUBM.SUBSTRING (End_NUMS2-1, 1);

IF (tmpstr == "/" || tmpstr == "') {

END_NUMS2 = END_NUMS2 - 1;

}

String tmpicstr = numberSubm.substring (End_n12, end_nums2 - end_n12);

IF (TMPPCSTR.INDEXOF ("http: //") <0)

{

IF (TMPPICSTR.INDEXOF ("/") == 0)

{

TMPPICSTR = URLCH1 TMPPICSTR;

}

Else

{

INT linshiintnum = 0;

INT FLAGS = 0;

String Urlchange = Suburl;

While (TMPPICSTR.INDEXOF ("../")> = 0)

{

Tmpicstr = tmpicstr.substring (TMPPCSTR.INDEXOF ("../") 3);

LINSHIINTNUM = LinshiintNum 1;

Flags = flags 1;

}

While ((Urlchange.lastIndexof ("/")> = 0) && (linshiintnum> = 0)))

{

Urlchange = urlchange.substring (0, urlchange.lastIndexof ("/"));

LINSHIINTNUM = LinshiintNum - 1;

}

IF (Flags == 0)

{

TMPPICSTR = URLCH2 "/" TMPPICSTR;

}

Else

{

TMPPICSTR = URLChange "/" TMPPICSTR;

}

// TmpPCSTR = TmpCStr.tolower ();

String TmpicsTRTMP = TMPPICSTR.TOLOWER ();

// if (TMPPCSTR.INDEXOF (".jpg")> 0 || TMPPCSTR.INDEXOF (".gif")> 0 || TMPPCSTR.INDEXOF (".bmp")> 0)

IF (TMPPICSTRTMP.INDEXOF (".jpg")> 0 || TMPPICSTRTMP.INDEXOF (".gif")> 0 || TMPPICSTRTMP.INDEXOF (".bmp")> 0)

{

REPICSTR = Repicstr "||" TMPPICSTR;

INT flagn2 = tmpicstr.lastindexof ("/");

String filen2 = pichtml tmpicstr.substring (flagn2);

STOTALSUBM = STOTALSUBM.SUBSTRING (0, end_nums2) "> ******" filen2 "****** <" numberSubm.substring (end_nums2); begin_nums2 = numberSubm.indexof ("src =" , END_NUMS2 Filen2.Length 22);

}

Else

{

Begin_nums2 = numberSubm.indexOf ("src =", end_nums2 4);

}

IF (repicstr.length> 2)

REPICSTR = repicstr.substring (2);

// Content processing formatting key tag

While (StotalSubm.indexof (" = 0)

{