Crawling web pages with PHP
Author: creating a coalition Joined: 2003-11-24 Views: 322 crawl pages, and text and images into the database, using the database getimg.php id = read picture getarticle.php id =?? Read the document / ** Table Document Articles Corresponding Type 1: Oracle, 2: Java, 3: SystemCreate Table Article (ID INT (6) Not Null Auto_Increment, Title Varchar (80) Default Null, Content Text, URL varchar (80) default nULL, joindate varchar (12) default nULL, articletype int (2) not null, PRIMARY KEY (id)); CREATE TABLE images (id int (4) NOT nULL auto_increment, bin_data longblob, filetype varchar (50 ) Default null (50) Default Null, ArticleId Int (6) Not Null, Primary Key (ID)) type = myisam; * / Class SaveWeb {var $ TITLE; var $ url; var $ typeid; var $ content Var $ getURL = true; var $ getimg = "getimg.php? id =" var $ dbuser = "root"; var $ dbpassword = "WHF76128"; var $ dbname = "tech"; var $ dbhost = "127.0 .0.1 "; Function SaveWeb ($ TITLE, $ URL, $ TYPEID) // Initialization, {$ this-> Title = $ TITLE; $ this-> URL = $ URL; $ this-> typeid = $ typeId;} Function SetContent ($ html) // Initialization, {$ this-> Content = $ HTML; $ this-> getURL = false;} function saveconte NT () // Direct storage paragraph text {$ DATE = Gmdate ("ymd"); $ data = NL2BR ($ THIS-> Content); $ data = addslashes ($ data); mysql_connect ($ this-> dbhost, $ this-> dbuser, $ this-> dbpassword); mysql_select_db ($ this-> dbname); $ results = mysql_query ("INSERT INTO ARTICLE (Title, Content, URL, Joindate, Articles) VALUES ('$ this-> title' , '$ data', '$ this-> URL', '$ DATE', $ THIS-> TYPEID "); $ ID = mysql_insert_id (); mysql_close (); return $ ID;} Function Websave () // Storage page {IF ($ this-> title == "|| $ this-> url =="
") Return False; if ($ this-> geturl == true) $ text = $ this-> gethtml ($ this-> URL); else {$ text = $ this-> content;} $ text2 = $ this- > Parserhtml ($ text); $ ID = $ this-> Savehtml ($ text2); $ this-> UpdateImgpid ($ ID, $ this-> title); $ this-> Delimg (); return $ ID;} / / Find $ STRCHILD in $ STROBJ, return value is location (found) and false (not finding the corresponding string). Function strfind ($ strobj, $ strchild, $ int) {$ intobj = Strlen ($ strobj); $ INTCHILD = STRLEN ($ STRCHILD); While ($ INT <= $ INTOBJ) {IF (STRTOLOWER (Substr ($ Strobj, $ Int, 1)) == $ STRCHILD [0]) // Intercepted from $ STROBJ The first character is further judged by the same character as the first character of $ STRCHILD. {IF (SUBSTR (SUBSTR ($ Strobj, $ Int, $ INTCHILD) == $ strchild) Return $ int;} $ int ;} Return False;} Function gethtml ($ URL) {IF ($ fp = fopen ($ URL, "R")) == false) {echo " read failed, File Location: $ URL font> < Br> "; return false;} $ data =" "; while (! Feof ($ fp)) {$ data = $ data.fread ($ fp, 512);} fclose ($ fp); return $ data;} function delImg () {mYSQL_CONNECT ($ this-> dbhost, $ this-> dbuser, $ this-> dbpassword); mysql_select_db ($ this-> dbname); $ result = MYSQL_QUERY ( "delete from images where articleid = 0") MSQL_Close (); function updateImgPID ($ id, $ title) {MYSQL_CONNECT ($ this-> dbhost, $ this-> dbuser, $ this-> dbpassword); mysql_select_db ($ this-> dbname); MYSQL_QUERY ( "update images set articleid = $ id WHERE TITLE = '$ TITLE' "); mysql_close ();} Function Savehtml ($ DATA) {$ DATE = Gmdate (" YMD "); $ data = addslashes ($ data); mysql_connect ($ this-> dbhost, $ This-> dbuser, $ this-> dbpassword); mysql_select_db ($ this-> dbname); $ results = mysql_query ("
Insert Into Article (Title, Content, URL, Joindate, Articles) VALUES ('$ this-> Title', '$ data', '$ this-> url', '$ date', $ this-> typeid ") $ ID = mysql_insert_id (); mysql_close (); returni ID;} Function SaveImg ($ URL) {$ data = $ this-> gethtml ($ URL); $ data = addslashes ($ data); mysql_connect ($ THIS -> DBHOST, $ this-> dbuser, $ this-> dbpassword); mysql_select_db ($ this-> dbname); $ results = mysql_query ("INSERT INTO images (bin_data, fileetype, title, articleid) VALUES ('$ data' , '"$ this-> getContentType ($ URL)."', '$ this-> title', 0) "); $ ID = mysql_insert_id (); mysql_close (); return $ ID;} Function getContentName ($ InfileName) {RETURN BASENAME ($ INFILENAME);} Function getContentType ($ INFILENAME) {// - Strippath $ INFILENAME = BaseName ($ INFILENAME); // - Check file extension IF (Strrchr ($ InfileName, " ") == false) {Return" Application / OCTET-Stream ";} // - Get the file extension and determine the file type $ extension = Strrchr ($ InfileName,"); Switch ($ extension) { Case ".gif": return "image / gif"; case ".gz": return "Application / X-gzip"; Case ".htm": return "text / html"; case ".html" : Return "text / html"; case ".jpg": return "image / jpeg"; case ".tar": return "Application / X-tar"; case ".txt": return "text / plain"; casse ".zip": return "Application / Zip"; Case ".png": return "image / png"; case ".bmp": return "image / bmp"; default: return "Application / OcT-stream"; Return "Application / OcTet-stream";} Function Parserhtml ($ text) {$ int = 0; $ baseURL =
PARSE_URL ($ this-> URL); $ urlhost = "http: //". $ baseURL ["Host"]; $ urldir = $ urlhost.dirname ($ baseURL ["Path"]); $ urldir = str_replace (" // "," / ", $ urldir); // Update Tag While ($ INT = $ THIS-> Strfind ($ TEXT," Strfind ($ TEXT, ">", $ Int); $ TMPTXT = Substr ($ TEXT, $ INT, $ Closecharpos- $ Int 1); $ srcstart = $ this-> strfind ($ TMPTXT, "SRC =", 0); $ srcend = 0; Switch (Substr ($ TMPTXT, $ SRCSTART 4, 1)) {Case '": $ srcend = $ this-> Strfind ($ TMPTXT,'", $ srcstart 5) $ IMGURL = SUBSTR ($ TMPTXT, $ SRCSTART 5, $ srcend- $ srcstart-5); Break; Case "'": $ srcend = $ this-> Strfind ($ TMPTXT, "'", $ srcstart 5 $ IMGURL = SUBSTR ($ TMPTXT, $ SRCSTART 5, $ srcend- $ srcstart-5); Break; Default: $ srcend = $ this-> strfind ($ tmptxt, ", $ srcstart 4); if ($ srcend == false) $ srcend = $ THIS-> Strfind ($ TMPTXT, '> "$ srcstart 4); $ IMGURL = Substr ($ TMPTXT, $ SRCSTART 4, $ srcend- $ srcstart-4) } $ TEMPIMGURL = $ IMGURL; $ TEMPFILE = PARSE_URL ($ this-> getimg); if ($ THIS-> Strfind ($ TMPTXT, "http: //", 0)! = true) {switch (Substr ($ TRUE) {Switch Imgurl, 0,1)) {CASE "/": $ IMGURL = $ URLHOST. $ I Mgurl; Break; Default: IF (Substr ($ URLDIR, Strlen ($ URLDIR) -1, 1) == "/" $ IMGURL = $ URLDIR. $ IMGURL; ELSE $ IMGURL = $ URLDIR. "/". IMGURL;}} IF ($ this-> Strfind ($ IMGURL, $ TEMPFILE ["Path"], 0)! = false) {$ int ; continue;} $ ID = $ this-> saveimg ($ imp); if ($ Id ; Continue;} $ newimgurl = $ this-> getimg. $ ID; $ text = str_replace ($ TEMPIMGURL, $ NewImgurl, $ text);