######################################################################
# [...] into a local MS Access database.
# Though this script is simple and straightforward, blogs linked as their
# owners' favorites may indicate a kind of relationship that helps to form
# a map and gives us a whole view of the cyber community.
# Further research would use a data mining algorithm such as Apriori ....
#
# Though it succeeds in crawling a minimum of blog links, it should be
# upgraded before being used for serious research.
# This script may be copied and rewritten at anyone's will.
#
# Init (initialization): the first start URL is a relatively important clue.
# Here I chose a popular blog on Sina, http://blog.sina.com.cn/m/xiaojingzi,
# as the initialization URL.
# Objective: to study blogs as a medium of contact relationships, and to
# study how interpersonal networks spread.
#
# 3-15-2006
######################################################################
# Blog-link crawler.
#
# Walks the rows of an ODBC table by index number, downloads the URL stored
# in each row, extracts the anchors that look like absolute blog links, and
# either stores each new link or appends the referring index to the link's
# existing "linkno" field.
#
# State is kept in config.txt as three numeric lines:
#   1. index of the next row to crawl
#   2. total number of links stored so far
#   3. "ranks": the run stops once this many new links have been added
use strict;
use warnings;

use Digest::MD5 qw(md5_hex);
use HTML::TreeBuilder;
use HTTP::Request;
use LWP::UserAgent;
use Win32::ODBC;

my $CONFIG_FILE = 'config.txt';
my $TEMP_HTML   = 'temphtmlstorefile.txt';
my $NO_HASH     = 'nun';    # sentinel meaning "no row with this hashcode yet"

# Read the three numeric settings from $path.
# Returns (index, total, ranks); a missing file or missing/non-numeric line
# yields 0 for that slot (a warning is emitted when the file cannot be opened).
sub read_config {
    my ($path) = @_;
    my @settings = (0, 0, 0);

    my $fh;
    if (!open $fh, '<', $path) {
        warn "Cannot read $path: $!\n";
        return @settings;
    }
    for my $slot (0 .. $#settings) {
        my $line = <$fh>;
        last if !defined $line;
        $settings[$slot] = 0 + $1 if $line =~ /(-?\d+)/;
    }
    close $fh;
    return @settings;
}

# Write the given values to $path, one per line, replacing the file.
# Returns 1 on success, nothing (after a warning) when the file cannot be
# opened for writing.
sub write_config {
    my ($path, @settings) = @_;

    my $fh;
    if (!open $fh, '>', $path) {
        warn "Cannot write $path: $!\n";
        return;
    }
    print {$fh} $_, "\n" for @settings;
    close $fh or warn "Cannot close $path: $!\n";
    return 1;
}

# Decide whether a (lower-cased) href is worth storing.
# It must contain "blog" and "http://", and must contain neither "image"
# nor a single quote.  Rejecting quotes also keeps the value safe to
# interpolate into the quoted SQL literals built below.
# NOTE(review): the single-quote test and the exact "http://" pattern were
# reconstructed from damaged text -- confirm against the original script.
sub is_blog_link {
    my ($href) = @_;
    return 0 if $href !~ /blog/;
    return 0 if $href =~ /image/;
    return 0 if $href =~ /'/;
    return $href =~ m{http://} ? 1 : 0;
}

# Run a query; on failure report the failing statement, close the
# connection and exit.  Win32::ODBC's Sql() returns a true value (an error
# number) when the statement fails.
sub run_query_or_exit {
    my ($db, $sql) = @_;
    return if !$db->Sql($sql);

    print "Error in SQL query: \"$sql\"!\n";
    print "Error: " . $db->Error() . "\n";
    $db->Close();
    exit;
}

# Read start index, total number and search ranks of links from config file.
my ($index, $totalnumber, $ranks) = read_config($CONFIG_FILE);

# The running total starts from the stored total.
my $currenttotalnumber = $totalnumber;

# Open the database first and check that it opened correctly.
my $dsn   = 'testdb';
my $table = 'Datastore';

my $db = Win32::ODBC->new($dsn);
if (!$db) {
    print "Error in opening DSN \"$dsn\"!\n";
    print "Error: " . Win32::ODBC::Error() . "\n";
    exit;
}

my $ua = LWP::UserAgent->new;

# Search loop: quit once (current total - starting total) reaches $ranks.
PAGE:
while (($currenttotalnumber - $totalnumber) < $ranks) {

    # Read the URL stored for the current index.
    # NOTE(review): column names are written in lower case throughout;
    # confirm the casing Win32::ODBC's Data() expects for this table.
    run_query_or_exit($db, "SELECT * FROM $table WHERE indexnum = $index;");

    my $url;
    while ($db->FetchRow()) {
        (undef, $url) = $db->Data('hashcode', 'url');
    }

    # Without a row for this index there is nothing left to crawl; stopping
    # here avoids spinning forever on failing requests.
    if (!defined $url) {
        print "No row found for index $index; stopping.\n";
        last PAGE;
    }
    print $url, "\n";

    # Download the page into a temporary file, then parse that file.
    my $req = HTTP::Request->new(GET => $url);
    my $res = $ua->request($req, $TEMP_HTML);

    if ($res->is_success) {
        print "ok\n";

        my $tree = HTML::TreeBuilder->new();
        $tree->parse_file($TEMP_HTML);

        LINK:
        for my $anchor ($tree->look_down('_tag', 'a')) {
            my $hrefvalue = $anchor->attr('href');
            next LINK if !defined $hrefvalue;

            $hrefvalue = lc $hrefvalue;
            next LINK if !is_blog_link($hrefvalue);
            print $hrefvalue, "\n";

            # Links are identified by the MD5 hex digest of the URL.
            my $hashcode = md5_hex($hrefvalue);
            run_query_or_exit($db,
                "SELECT * FROM $table WHERE hashcode = '$hashcode';");

            my $ihash = $NO_HASH;
            my $ilink;
            while ($db->FetchRow()) {
                ($ihash, $ilink) = $db->Data('hashcode', 'linkno');
            }
            print $ihash, "\n";

            my $istr = sprintf '%d', $index;

            if ($ihash eq $NO_HASH) {
                # New link: store it and count it.
                $currenttotalnumber = $currenttotalnumber + 1;

                # NOTE(review): the original VALUES list was damaged.  The
                # column list and the use of the new running total as the
                # row's index number are assumptions -- confirm against the
                # table definition.
                my $sqlinsert =
                      "INSERT INTO $table (indexnum, hashcode, url, linkno) "
                    . "VALUES ($currenttotalnumber, '$hashcode', "
                    . "'$hrefvalue', '$istr')";
                my $rc = $db->Sql($sqlinsert);
                die qq(SQL failed "$sqlinsert": ), $db->Error(), qq(\n)
                    if $rc;
            }
            else {
                # Known link: append the referring index to its link field.
                my $known = defined $ilink ? $ilink : '';
                $ilink = join '', $known, ',', $istr;

                my $sqlupdate =
                    "UPDATE $table SET linkno = '$ilink' "
                    . "WHERE hashcode = '$ihash'";
                my $rc = $db->Sql($sqlupdate);
                die qq(SQL failed "$sqlupdate": ), $db->Error(), qq(\n)
                    if $rc;
            }
        }

        $tree->delete;
    }
    else {
        print $res->status_line, "\n";
    }

    # Advance to the next row and persist progress after every page so an
    # interrupted run can resume.
    $index = $index + 1;
    write_config($CONFIG_FILE, $index, $currenttotalnumber, $ranks);
}

# Close the database and report the final total.
$db->Close();
print $currenttotalnumber, "\n";