using System;
using System.Collections;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
/// <summary>
/// Console tool that downloads a web page, extracts every distinct
/// http:// hyperlink found in its HTML and writes the sorted list to
/// HyperLinks.xml in the current directory.
/// </summary>
public class App
{
    // Compiled once and reused; all three patterns are case-insensitive.
    private static readonly Regex HyperLinkRegex = new Regex(
        @"http://([\w-]+\.)+[\w-]+(/[\w ./?%&=-]*)?", RegexOptions.IgnoreCase);

    private static readonly Regex DomainSuffixRegex = new Regex(
        @"(\.com/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

    private static readonly Regex DotOrTrailingSlashRegex = new Regex(@"\.|/$");

    /// <summary>
    /// Program entry point: prompts for an address, fetches the page,
    /// extracts its hyperlinks and writes them to the XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("Please enter a web address:");
        string strURL = Console.ReadLine();

        // ReadLine returns null at end of input; nothing to do without an address.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("No web address entered.");
            return;
        }
        strURL = strURL.Trim();

        // StartsWith is safe for inputs shorter than the prefix, and an
        // explicit https:// address must not get a second scheme prepended.
        bool hasScheme =
            strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strURL = "http://" + strURL;
        }

        Console.WriteLine("I am getting a page code, please wait ...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("I'm extracted hyperlink, please wait ...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("Write file, please wait ...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the HTML source of a page.
    /// </summary>
    /// <param name="URL">Absolute http/https address of the page.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request properties only take effect when set before GetResponse().
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose the response and reader so the connection is released
        // even if reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (StreamReader reader = new StreamReader(
            hwRes.GetResponseStream(), Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct http:// URLs that appear in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
    /// <returns>An ArrayList of URL strings without duplicates, sorted.</returns>
    public static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return al;
        }

        // Hashtable lookup filters repeated URLs without rescanning the list.
        Hashtable seen = new Hashtable();
        foreach (Match match in HyperLinkRegex.Matches(htmlCode))
        {
            string strNew = match.Value;
            if (!seen.ContainsKey(strNew))
            {
                seen.Add(strNew, null);
                al.Add(strNew);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the URLs to HyperLinks.xml, one element per URL, where the
    /// element name is the URL's domain suffix as returned by GetDomain.
    /// </summary>
    /// <param name="strURL">Address the links were extracted from (recorded in an XML comment).</param>
    /// <param name="alHyperLinks">URL strings to write.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // The using block closes the file even if a write throws.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);

            // "--" is not allowed inside an XML comment, so break it up.
            writer.WriteComment("Extract from " + strURL.Replace("--", "- -") + " hyperlink");

            // Root element plus one nested element of the same name that
            // carries the timestamp. NOTE(review): layout kept as-is; confirm
            // against urls.dtd whether the nesting is intended.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                string title = GetDomain(str);
                writer.WriteElementString(title, null, str);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL.
    /// </summary>
    /// <param name="strURL">URL to inspect; may be null.</param>
    /// <returns>
    /// The first ".com/", ".org/" or ".gov/" occurrence with the dot and
    /// trailing slash removed (casing as found in the URL), or "other"
    /// when none of them occurs.
    /// </returns>
    public static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        // An unsuccessful match has an empty Value.
        string retVal = DomainSuffixRegex.Match(strURL).Value;
        retVal = DotOrTrailingSlashRegex.Replace(retVal, "");

        // The result is used as an XML element name, so it must never be empty.
        if (retVal == "")
        {
            retVal = "other";
        }
        return retVal;
    }
}