Extracting hyperlinks from a web page (C#)

xiaoxiao2021-03-06  41

using System;
using System.Collections;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

/// <summary>
/// Console tool that downloads a web page, extracts the absolute
/// http/https links found in its HTML and writes them, de-duplicated and
/// sorted, to <c>HyperLinks.xml</c> in the current directory.
/// </summary>
public class App
{
    /// <summary>
    /// Entry point: prompts for an address, downloads the page, extracts
    /// its links and writes them to the XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("Please enter a web address:");
        string input = Console.ReadLine();

        // ReadLine returns null at end of input; an empty line is equally unusable.
        if (input == null || input.Trim().Length == 0)
        {
            Console.WriteLine("No web address entered.");
            return;
        }

        string strURL = NormalizeUrl(input);

        Console.WriteLine("I am getting a page code, please wait ...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("I'm extracted hyperlink, please wait ...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("Write file, please wait ...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Trims the address and prefixes it with <c>http://</c> unless it
    /// already starts with <c>http://</c> or <c>https://</c> (any casing).
    /// </summary>
    /// <param name="input">Address as typed by the user.</param>
    /// <returns>The address with a scheme in front.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    internal static string NormalizeUrl(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        string trimmed = input.Trim();

        // StartsWith is safe for inputs shorter than the prefix, unlike
        // Substring(0, 7), which throws on them.
        if (trimmed.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            trimmed.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return trimmed;
        }

        return "http://" + trimmed;
    }

    /// <summary>
    /// Downloads the page at <paramref name="URL"/> and returns its body
    /// as text.
    /// </summary>
    /// <param name="URL">Absolute address of the page.</param>
    /// <returns>The response body decoded as GB2312.</returns>
    internal static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);

        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request options must be set before GetResponse() sends the request.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // The using blocks release the connection and stream even if reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (Stream body = hwRes.GetResponseStream())
        // The body is always decoded as GB2312, whatever charset the server declares.
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http/https URLs found in
    /// <paramref name="htmlCode"/>.
    /// </summary>
    /// <param name="htmlCode">HTML text to scan; null is treated as empty.</param>
    /// <returns>The distinct URLs as strings, sorted with the default comparer.</returns>
    internal static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (htmlCode == null || htmlCode.Length == 0)
        {
            return al;
        }

        // The hyphen and space inside the character class are escaped so
        // they are always read as literals.
        string strRegex = @"https?://([\w-]+\.)+[\w-]+(/[\w\-\ ./?%&=]*)?";
        Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

        // The hash table gives constant-time duplicate detection in place
        // of rescanning the list for every match.
        Hashtable seen = new Hashtable();
        foreach (Match m in r.Matches(htmlCode))
        {
            string strNew = m.Value;
            if (!seen.ContainsKey(strNew))
            {
                seen.Add(strNew, null);
                al.Add(strNew);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the links to <c>HyperLinks.xml</c>, one element per link,
    /// named after the link's domain suffix (see <see cref="GetDomain"/>).
    /// </summary>
    /// <param name="strURL">Address of the page the links came from; used in the file's comment.</param>
    /// <param name="alHyperLinks">Links to write, as strings.</param>
    internal static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // The using block closes the file even if a write throws.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);

            // "--" is illegal inside an XML comment and would make
            // WriteComment throw, so it is broken up first.
            string safeUrl = (strURL == null) ? "" : strURL.Replace("--", "- -");
            writer.WriteComment("Extract from " + safeUrl + " hyperlink");

            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                writer.WriteElementString(GetDomain(str), null, str);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();

            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of <paramref name="strURL"/> without the
    /// dot: one of com, org or gov, in the casing found in the URL.
    /// </summary>
    /// <param name="strURL">URL to classify.</param>
    /// <returns>The suffix, or "other" when none of the known suffixes is found.</returns>
    internal static string GetDomain(string strURL)
    {
        if (strURL == null || strURL.Length == 0)
        {
            return "other";
        }

        // The suffix must be followed by a path, port, query or fragment
        // delimiter or by the end of the string, so "http://example.org"
        // is recognised and ".company" is not mistaken for ".com".
        string strRegex = @"\.(com|org|gov)(?=[/:?#]|$)";
        Match m = Regex.Match(strURL, strRegex, RegexOptions.IgnoreCase);

        return m.Success ? m.Groups[1].Value : "other";
    }
}

转载请注明原文地址:https://www.9cbs.com/read-74378.html

New Post(0)