}
While (ret.indexof (" a")> = 0)
{
Ret = ret.substring (0, ret.indexof (" a")) " a" ret.substring (Ret.indexof (" a") 3);
}
While (ret.indexof ("href =")> = 0)
{
Ret = ret.substring (0, ret.indexof ("href =")) "href =" ret.substring (Ret.indexof ("href =") 5);
}
While (ret.indexof ("href =")> = 0)
{
Ret = ret.substring (0, ret.indexof ("href =")) "href =" ret.substring (Ret.indexof ("href =") 5);}
While (ret.indexof ("href = ')> = 0)
{
Ret = ret.substring (0, ret.indexof ("href = ')) " href = / " ret.substring (Ret.indexof (" href =') 6);
}
}
TMPallstr = RET;
INT begin_nums = tmpallstr.indexof ("href =");
While (begin_nums> = 0)
{
String tmpstra = "";
String tmpstrb = tmpallstr.substring (begin_nums 5, 1);
IF (tmpstrb == "/")
{
END_N1 = Begin_NUMS 6;
IF ((end_n1 1)> TMPallStr.length)
{
""; "
}
Tmpstra = tmpallstr.substring (begin_nums 6, 1);
}
Else
{
END_N1 = Begin_NUMS 5;
TmpStra = TmpStrb;
}
IF (tmpstra == "#")
{
TMPallstr = tmpallstr.substring (end_n1);
Begin_nums = tmpallstr.indexof ("href =");
}
Else
{
End_nums1 = tmpallstr.indexof ("", END_N1);
End_nums2 = tmpallstr.indexof (">", end_n1);
End_nums3 = tmpallstr.indexof (" a", end_nums2);
IF ((End_nums3> = 0) && (end_nums2> = 0)))
{
Retitle = tmpallstr.substring (end_nums2 1, end_nums3 - end_nums2 - 1);
IF (End_nums1> End_nums2)
{
END_NUMS = end_nums2;
}
Else
{
IF (End_nums1 <0)
{
END_NUMS = end_nums2;
}
Else
{
END_NUMS = END_NUMS1;
}
}
String str4 = tmpallstr.substring (end_nums-1, end_nums - end_nums 1);
IF (str4 == "/" || str4 == "')
{
END_NUMS = END_NUMS - 1;
}
String stotalone = tmpallstr.substring (end_n1, end_nums - end_n1);
IF (Stotalone.Indexof ("http: //") <0)
{
IF (Stotalone.indexof ("/") == 0)
{
STOTALONE = URLCH1 STOTALONE;
}
Else
{
INT linshiintnum = 0;
INT FLAGS = 0;
String urlchange = urlstr ;;
While (stotalone.indexof ("../")> = 0)
{
STOTALONE = STOTALONE.SUBSTRINE (STOTALONE.INDEXOF ("../") 3);
LINSHIINTNUM = LinshiintNum 1;
Flags = flags 1;
}
While ((Urlchange.lastIndexof ("/")> = 0) && (linshiintnum> = 0)))
{
Urlchange = urlchange.substring (0, urlchange.lastIndexof ("/"));
LINSHIINTNUM = LinshiintNum - 1;
}
IF (Flags == 0)
{
STOTALONE = URLCH2 "/" STOTALONE;
}
Else
{
STOTALONE = URLChange "/" STOTALONE;
}
}
}
Reutstr = Reutstr new publicfun (). Removehtmlcode (RetiTle) STOTALONE
TMPallstr = tmpallstr.substring (end_nums3 4);
Begin_nums = tmpallstr.indexof ("href =");
}
Else
{
Begin_nums = -1;
}
}
}
RETURN Reutstr;
}
Catch (Exception E)
{
""; "
}
}
After collecting the URLs to crawl, fetch each page and process its content:
// Fetch the linked page's content and classify it
// Fetches the page at Suburl and returns its extracted, tag-stripped content.
//
// NOTE(review): this listing appears to be C# (.NET) that was damaged in transcription:
// identifier casing is inconsistent, the string-concatenation operators are missing, and
// several string literals further down (the key-tag markers) are truncated. It will not
// compile as shown. The summary below states only what the visible statements show;
// confirm every detail against the original source before relying on it.
//
// Steps, in the order they appear:
//   1. Derive two base URLs from Gatherurl: urlch1 is the prefix up to the first slash
//      that follows the first dot, urlch2 is the prefix up to the last slash. When no
//      such slash exists both are set to Gatherurl itself.
//   2. Download Suburl through publicfun.get_http (Time1 is not declared in this block).
//   3. Take the part of the page that starts at B_Content and stops before E_CONTENT.
//   4. Remove one span running from B_FILTER through the end of E_FILTER, if both occur.
//   5. Walk the src attributes. A relative address is made absolute with urlch1 (leading
//      slash), urlch2 (no parent-directory segments) or a trimmed copy of Suburl (with
//      parent-directory segments). Addresses whose lower-cased form contains .jpg, .gif
//      or .bmp are appended to repicstr and a starred placeholder holding the local
//      file name is inserted into the content.
//   6. Replace selected tags with placeholders (literals truncated in this listing),
//      then delete everything between each remaining opening and closing angle bracket.
//   7. Turn the placeholders back into their final form (literals truncated here too).
//   8. Remove the first occurrence of Subtitle from the content.
//   9. Split repicstr on its separator and call get_img for every non-empty entry, with
//      a timeout of 1000 and a target path built from Root, an images folder and the
//      file name taken from the address.
//
// Parameters:
//   Gatherurl            - URL used only to compute the two base URLs of step 1.
//   Suburl               - URL of the page that is downloaded and processed.
//   Subtitle             - text removed from the extracted content in step 8.
//   B_Content, E_CONTENT - start and end markers of the content span.
//   B_FILTER, E_FILTER   - start and end markers of the span removed in step 4.
//   ROOT                 - base directory under which the images are saved.
//
// Returns: recontentstr, which starts empty and is assigned the processed content only
// inside the branch that runs when the downloaded page is not empty.
//
// NOTE(review): the stray empty-string statements inside several guard blocks look like
// damaged early returns of an empty string - TODO confirm against the original.
// NOTE(review): dfstrb and dfstre are initialised to empty strings and never reassigned
// in the visible code, so the source-extraction branch guarded by them cannot run as written.
Public String GetWebContent (String Gatherurl, String Suburl, String Subtitle, String B_Content, String E_CONTENT, STRING B_FILTER, STRING E_FILTER, STRING ROOT)
{
String tmpallstr = ""
String dfstrb = "";
String dfstre = "";
String repicstr = ""; // separator-delimited list of the absolute image addresses found in the content
String recontentstr = "";
String Pichtml = "Images"; // local folder name that replaces each image's remote directory
// Base URLs used later to make relative image addresses absolute.
String urlch1 = ""; string urlch2 = "";
// Position of the first dot and of the last slash in Gatherurl.
INT POS1 = GATHERURL.INDEXOF (".");
INT POS2 = GATHERURL.lastIndexof ("/");
IF (POS1 <0)
{
""; "
}
IF (POS2 <0)
{
""; "
}
INT POS3 = GATHERURL.INDEXOF ("/", POS1);
IF (POS3 <0)
{
URLCH1 = GATHERURL;
URLCH2 = GATHERURL;
}
Else
{
URLCH1 = Gatherurl.Substring (0, POS3);
URLCH2 = Gatherurl.Substring (0, POS2);
}
TMPallstr = new publicfun (). get_http (Suburl, Time1);
// release source
String Docromstr = ""
IF (dfstrb! = "" && dfstre! = ")
{
IF (tmpallstr! = "")
{
INT B_DOCF = TMPallStr.Indexof (DFSTRB);
IF (b_docf> 0)
{
INT E_DOCF = TmPallStr.indexof (DFSTRE, B_DOCF DFSTRB.LENGTH);
IF (e_docf> 0 && e_docf> b_docf && e_docf - b_docf <20)
{
Docfromstr = tmpallstr.substring (b_docf dfstrb.length, e_docf - b_docf - dfstrb.length);
}
}
}
}
// Take content
IF (tmpallstr! = "")
{
Int begin_strnum = tmpallstr.indexof (b_content);
IF (Begin_Strnum <0)
{
""; "
}
INT end_strnum = tmpallstr.indexof (e_content, begin_strnum b_content.length);
IF (End_Strnum <0)
{
""; "
}
String stotalsubm = ""
IF (End_Strnum> Begin_Strnum)
{
STOTALSUBM = TMPallstr.substring (begin_strnum, end_strnum - begin_strnum);
}
IF (STOTALSUBM == "")
{
""; "
}
// Filter useless information
Int bfnum = numberSubm.indexof (b_filter);
IF (bfnum> -1)
{
INT EFNUM = STOTALSUBM.INDEXOF (E_FILTER, BFNUM);
IF (EFNUM> -1)
{
IF (EFNUM> BFNUM) {
STOTALSUBM = STOTALSUBM.SUBSTRING (0, BFNUM) STOTALSUBM.SUBSTRING (EFNUM E_FILTER.LENGTH);
}
}
}
/ / Format the picture mark
WHILE (STOTALSUBM.INDEXOF ("src =")> = 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, SRC = ")) " SRC = " STOTALSUBM.SUBSTRING (SRC =" 4);
}
WHILE (STOTALSUBM.INDEXOF ("src =")> = 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, SRC = ")) " SRC = " STOTALSUBM.SUBSTRING (SRC =" 4);
}
While (STOTALSUBM.INDEXOF ("src = '")> = 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, SRC = ')) "SRC = /" " STOTALSUBM.SUBSTRING (SRC =') 5);
}
// Take a picture address
INT END_N12 = 0;
INT END_NUMS2 = 0;
INT begin_nums2 = numberSubm.indexof ("src =");
While (begin_nums2> = 0)
{
String tmpstr = numberSubm.substring (begin_nums2 4, 1);
IF (tmpstr == "/")
{
END_N12 = Begin_NUMS2 5;
}
Else
{
END_N12 = Begin_NUMS2 4;
}
Int end_nums2a = numberSubm.indexof ("", end_n12);
Int end_nums2b = numberSubm.indexof (">", end_n12;
IF (end_nums2b <0)
{
Break;
}
IF (End_nums2a> End_nums2b)
{
END_NUMS2 = END_NUMS2B;
}
Else
{
IF (End_nums2a <0)
{
END_NUMS2 = END_NUMS2B;
}
Else
{
END_NUMS2 = End_nums2a;
}
}
Tmpstr = STOTALSUBM.SUBSTRING (End_NUMS2-1, 1);
IF (tmpstr == "/" || tmpstr == "') {
END_NUMS2 = END_NUMS2 - 1;
}
String tmpicstr = numberSubm.substring (End_n12, end_nums2 - end_n12);
IF (TMPPCSTR.INDEXOF ("http: //") <0)
{
IF (TMPPICSTR.INDEXOF ("/") == 0)
{
TMPPICSTR = URLCH1 TMPPICSTR;
}
Else
{
INT linshiintnum = 0;
INT FLAGS = 0;
String Urlchange = Suburl;
While (TMPPICSTR.INDEXOF ("../")> = 0)
{
Tmpicstr = tmpicstr.substring (TMPPCSTR.INDEXOF ("../") 3);
LINSHIINTNUM = LinshiintNum 1;
Flags = flags 1;
}
While ((Urlchange.lastIndexof ("/")> = 0) && (linshiintnum> = 0)))
{
Urlchange = urlchange.substring (0, urlchange.lastIndexof ("/"));
LINSHIINTNUM = LinshiintNum - 1;
}
IF (Flags == 0)
{
TMPPICSTR = URLCH2 "/" TMPPICSTR;
}
Else
{
TMPPICSTR = URLChange "/" TMPPICSTR;
}
}
}
// TmpPCSTR = TmpCStr.tolower ();
String TmpicsTRTMP = TMPPICSTR.TOLOWER ();
// if (TMPPCSTR.INDEXOF (".jpg")> 0 || TMPPCSTR.INDEXOF (".gif")> 0 || TMPPCSTR.INDEXOF (".bmp")> 0)
IF (TMPPICSTRTMP.INDEXOF (".jpg")> 0 || TMPPICSTRTMP.INDEXOF (".gif")> 0 || TMPPICSTRTMP.INDEXOF (".bmp")> 0)
{
REPICSTR = Repicstr "||" TMPPICSTR;
INT flagn2 = tmpicstr.lastindexof ("/");
String filen2 = pichtml tmpicstr.substring (flagn2);
STOTALSUBM = STOTALSUBM.SUBSTRING (0, end_nums2) "> ******" filen2 "****** <" numberSubm.substring (end_nums2); begin_nums2 = numberSubm.indexof ("src =" , END_NUMS2 Filen2.Length 22);
}
Else
{
Begin_nums2 = numberSubm.indexOf ("src =", end_nums2 4);
}
}
IF (repicstr.length> 2)
REPICSTR = repicstr.substring (2);
// Content processing formatting key tag
While (StotalSubm.indexof ("
= 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF ("
}
While (StotalSubm.indexof ("
= 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF ("
}
While (stotalsubm.indexof (" p")> = 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF (" p")) "| **** | <" numberSubm.substring (STOTALSUBM.INDEXOF (" p") 3);
}
While (stotalsubm.indexof (" p")> = 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF (" p")) "| **** | <" numberSubm.substring (STOTALSUBM.INDEXOF (" p") 3);
}
While (stotalsubm.indexof ("
= 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF ("
}
While (stotalsubm.indexof ("
= 0) {
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF ("
}
While (stotalsubm.indexof ("
= 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF ("
}
While (stotalsubm.indexof ("
= 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF ("
}
// Remove HTML tag
INT Linshiint1 = STOTALSUBM.INDEXOF ("<");
INT Linshiint2 = STOTALSUBM.INDEXOF (">");
IF (linshiint2
{
STOTALSUBM = STOTALSUBM.SUBSTRING (linshiint2 1);
}
INT Linshiint11 = STOTALSUBM.LastIndexof ("<");
INT Linshiint12 = numberSubm.lastIndexof (">");
IF (linshiint12
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, LinshiINT12 1);
}
LINSHIINT1 = STOTALSUBM.INDEXOF ("<");
While (linshiint1> = 0)
{
LINSHIINT2 = STOTALSUBM.INDEXOF (">", linshiint1);
IF (linshiint2> = 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, Linshiint1) STOTALSUBM.SUBSTRING (linshiint2 1);
}
Else
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, linshiint1);
}
LINSHIINT1 = STOTALSUBM.INDEXOF ("<");
}
// Restore key mark
INT linshiint3 = 0;
INT linshiint4 = 0;
While (stotalsubm.indexof (" **** ")> = 0) {
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF (" **** ")) "
/ n" numberSubm.Substring (STOTALSUBM.INDEXOF (" **** ") 9 );
}
While (stotalsubm.indexof ("| **** |")> = 0)
{
STOTALSUBM = numberSubm.substring (0, STOTALSUBM.INDEXOF ("| **** |")) "
/ n" numberSubm.substring ("| **** |") 9 );
}
While (stotalsubm.indexof ("******")> = 0)
{
LINSHIINT3 = STOTALSUBM.INDEXOF ("******") 9;
LINSHIINT4 = STOTALSUBM.INDEXOF ("******", linshiint3);
IF (linshiint4> = 0)
{
INT TMPPOS = STOTALSUBM.INDEXOF ("******");
String tmpstr1 = numberSubm.substring (0, TMPPOS);
String Tmpstr2 = STOTALSUBM.SUBSTRING (Linshiint3, Linshiint4 - linshiint3);
String Tmpstr3 = STOTALSUBM.SUBSTRING (Linshiint4 9);
STOTALSUBM = TmpStr1 "
" tmpstr3;
}
Else
{
Break;
}
}
// Remove the title in the content
IF (STOTALSUBM.INDEXOF (SUBTITE)> = 0)
{
STOTALSUBM = STOTALSUBM.SUBSTRING (0, STOTALSUBM.INDEXOF (SUBTITLE)) STOTALSUBM.SUBSTRING (STOTALSUBM.INDEXOF (Subtitle) Subtitle.length;
}
Recontentstr = numberSubm;
// Call Download Image Features
// Download the picture to the specified directory
String [] img_url = new publicfun (). split (repicstr, "||");
For (int i = 0; i
{
IF (IMG_URL [I]! = "")
{
New publicfun (). get_img (IMG_URL [I], 1000, Root "// images //" IMG_URL [i] .substring (img_url [i] .lastIndexof ("/") 1));}
}
}
RETURN RecontentStr;
}
The method above returns the extracted information, such as the title, the content, and the image addresses.
Downloading the page's pictures:
// Downloads the picture at a_strurl and saves it to Filepath.
//
// Parameters:
//   a_strurl - absolute URL of the image to download.
//   Timeout  - request timeout in milliseconds (assigned to HttpWebRequest.Timeout).
//   Filepath - full path of the local file to write; its directory is created on demand.
//
// Failures are not propagated: any exception is written to an error.log file placed in
// the target directory (or the current directory when Filepath has no directory part).
//
// Changes from the previous version:
//   * the response, the stream and the bitmap are disposed deterministically;
//   * the throw-away PictureBox control is gone - the bitmap is saved directly;
//   * the directory is split off on real path separators instead of the damaged
//     double-slash literal, so neither the normal path nor the error path can throw
//     from Substring when the separator is missing;
//   * the log file name and the logged URL variable are spelled correctly.
//
// NOTE(review): CreateDir and Writelog are helpers defined elsewhere in this class;
// their exact casing could not be verified from this listing - confirm before merging.
public void get_img(string a_strurl, int Timeout, string Filepath)
{
    try
    {
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strurl);
        myReq.Timeout = Timeout;

        // The bitmap reads lazily from the stream, so it must be saved while both the
        // response and the stream are still open.
        using (HttpWebResponse httpWResp = (HttpWebResponse)myReq.GetResponse())
        using (Stream myStream = httpWResp.GetResponseStream())
        using (Bitmap map = new Bitmap(myStream))
        {
            string path = Path.GetDirectoryName(Filepath);
            if (path != null && path.Length > 0 && !Directory.Exists(path))
            {
                CreateDir(path);
            }
            map.Save(Filepath);
        }
    }
    catch (Exception exp)
    {
        // Work out the log location by hand: Path helpers may themselves throw on a
        // malformed Filepath, and the error path must never raise a second exception.
        string logFile = "error.log";
        if (Filepath != null)
        {
            int cut = Filepath.LastIndexOfAny(new char[] { '\\', '/' });
            if (cut > 0)
            {
                logFile = Filepath.Substring(0, cut) + "\\error.log";
            }
        }
        Writelog(logFile, a_strurl + " - " + exp.Message + "\r\n");
    }
}
- Save to a file or to a database
The information obtained above can be saved according to your own requirements.
**** Design note: the program does not crawl URLs level by level; the URLs to capture are defined up front, which makes the capture more efficient and faster.
Test program download: http://bjfile.focus.cn/file/15379/netbugv102.rar
If you have any suggestions, please email xiancai@qianlong.com or contact yutao728@hotmail.com on MSN.
Note: This version only provides a static file storage function, does not provide a database interface, and does not provide custom website features.
Running this program requires the .NET Framework 1.1 to be installed.