



版權說明:本文檔由用戶提供並上傳,收益歸屬內容提供方;若內容存在侵權,請進行舉報或認領。
文檔簡介
用 C 語言編寫一個網絡蜘蛛來搜索網上出現的電子郵件地址
作者:zhoulifa 來源:(原文此處缺失)
可能大家經常要去互聯網上搜索特定的內容,比如收集大量郵件地址,如果用 Google 之類的搜索引擎是沒法實現這種特定功能的,所以用 C 語言來寫一個吧。它的功能就是不斷去取得網絡上的頁面,然後分析出網頁上出現的郵件地址並保存下來。像個蜘蛛一樣,從網絡上一個網頁爬向另一個網頁,不停止地搜索郵件地址。即:分析程序運行時的參數,把各網頁地址作為根節點加入到鏈表,然後從鏈表頭開始處理各節點。對整個鏈表的處理是先處理兄弟節點,流程圖如下:(原圖缺失)然後再處理各節點的子節點,流程圖如下:(原圖缺失)當然,這裡採用了遞歸調用方法,處理子節點的數據時和處理整個鏈表一樣循環處理就是了。
/*
 * 關於本文檔
 * filename: 用 C 語言編寫一個網絡蜘蛛來搜索網上出現的電子郵件地址
 * purpose: 一個郵址搜索程序的雛形
 * wrote by: 周立發(Linux 愛好者、Linux 知識傳播者、SOHO 族、開發者,最擅長 C 語言)
 * date time: 2006-08-31 21:00:00
 * Note: 任何人可以任意複製代碼並運用這些文檔,當然包括你的商業用途
 *       但請遵循 GPL
 * Hope: 希望越來越多的人貢獻自己的力量,為科學技術發展出力
 */
程序在運行的過程中要建立一個樹形鏈表結構,結構圖如下:(原圖缺失)程序啟動時分析所帶參數,把各參數加入到根網頁節點,如果有多個參
3、數則這個根網頁有兄弟節點。然后從根節點開始處理這一級上各節點, 把各節點網頁上出現的網頁鏈接加到該節點的子節點上,處理完當前這一級后處理子節點這一級。當然這只是一個原理展示程序,并沒有進行優化。這個程序的main 函數流程圖如下:源代碼如下:#include <sys/>#include <sys/>#include <>#include <sys/>#include <>#include <>#include <>#include <>#include <>#include <&
4、gt;#include <>#define ACCEPT "*/*"#define ACCEPTLANGUAGE "zh-cn,zh;q="#define ACCEPTENCODING "gzip,deflate"#define ACCEPTCHARSET "gb2312,utf-8;q=,*;q="#define KEEPALIVE "300"#define CONNECTION "keep-alive"#define CONTENTTYPE "app
5、lication/x-www-form-urlencoded"#define MAXFILENAME 14#define DEBUG 1typedef struct webnode char * host;/*網頁所在的主機*/int port;/*網絡服務器所使用的端口*/char * dir;/*網頁所在的目錄*/char * page;/*網頁文件名*/char * file;/*本地保存的文件名*/char IsHandled;/*是否處理過*/struct webnode * brother;/* 兄弟節點鏈表指針*/struct webnode * child;/* 子節
6、點鏈表指針*/ WEBNODE;struct sockaddr_in server_addr;int sockfd = 0, dsend = 0, totalsend = 0, nbytes = 0, reqn = 0, i = 0, j = 0, ret = 0; struct hostent *host;char request409600 = "", buffer1024 = "", httpheader1024 = "" int FileNumber = 0;char e2 = "/"WEBNODE * N
7、odeHeader, * NodeTail, * NodeCurr;char * mapped_mem;int GetHost(char * , char * , char * , int * , char * ); /*/void AnalyzePage(WEBNODE *); /*/void AddInitNode(char *, char *, int, char * ); /*/void HandleInitNode(WEBNODE *); /*/void DisplayNode(WEBNODE *); /*/void HandOneNode(WEBNODE *); /*/void D
8、oneWithList(int); /*/void DoOnce(); /*/void ConnectWeb(void); /*/void SendRequest(void); /*/void ReceiveResponse(void); /*/void GetEmail(char * ); /*/void GetLink(char * ); /*/void GetBeforePos(char * , char * ); /*/void GetAfterPos(char * , char * ); /*/void AddChildNode(WEBNODE * , char * ); /*/vo
9、id GetAfterPosWithSlash(char * , char * ); /*/void GetMemory(char * , int ); /*/int IsExistWeb(WEBNODE * , char * , char * , int , char * ); /*/ void Rstrchr(char * , int , char * ); /*/int GetLocalAgent(char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptChar
10、set, char * KeepAlive, char * Connection, char * ContentType); /*/*功能:設置HTTP 協議頭內容的一些固定值*/int GetLocalAgent(char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptCharset, char * KeepAlive, char * Connection, char * ContentType)memcpy(UserAgent, USERAGENT, strlen
11、(USERAGENT);memcpy(Accept, ACCEPT, strlen(ACCEPT);memcpy(AcceptLanguage, ACCEPTLANGUAGE, strlen(ACCEPTLANGUAGE); memcpy(AcceptEncoding, ACCEPTENCODING , strlen(ACCEPTENCODING); memcpy(AcceptCharset, ACCEPTCHARSET, strlen(ACCEPTCHARSET); memcpy(KeepAlive, KEEPALIVE, strlen(KEEPALIVE); memcpy(Connecti
12、on, CONNECTION, strlen(CONNECTION); memcpy(ContentType, CONTENTTYPE, strlen(CONTENTTYPE); return 0;/*功能:在字符串s 里搜索x 字符,并設置指針d 指向該位置*/void Rstrchr(char * s, int x, char * d)int len = strlen(s) - 1;while(len >= 0)if(x = slen) (*d) = s + len; return;len-;(*d) = 0;/*功能:連接一個網站服務器*/void ConnectWeb(void)
13、 /* connect to web server */* create a socket descriptor */if(sockfd=socket(PF_INET,SOCK_STREAM,0)=-1)fprintf(stderr," Socket Error:%sa ",strerror(errno);exit(1);/* bind address */bzero(&server_addr, sizeof(server_addr);= AF_INET;= htons(NodeCurr->port);= *(struct in_addr *)host->
14、;h_addr);/* connect to the server */if(connect(sockfd, (struct sockaddr *)(&server_addr), sizeof(struct sockaddr) = -1)fprintf(stderr, " Connect Error:%sa ", strerror(errno);exit(1);/*功能:向網站發送HTTP 請求*/void SendRequest(void) /* send my http-request to web server */ dsend = 0;totalsend =
15、 0;nbytes=strlen(request);while(totalsend < nbytes) dsend = write(sockfd, request + totalsend, nbytes - totalsend); if(dsend=-1) fprintf(stderr, " send error!%s ", strerror(errno);exit(0); totalsend+=dsend;fprintf(stdout, " Request.%d %d bytes send OK! ", reqn, totalsend);/*功能
16、:接收網站的HTTP 返回*/void ReceiveResponse(void) /* get response from web server */ fd_set writefds;struct timeval tival;int retry = 0;FILE * localfp = NULL;i=0; j = 0;_ReCeive:FD_ZERO(&writefds);= 10;= 0;if(sockfd > 0) FD_SET(sockfd, &writefds);else fprintf(stderr, " Error, socket is negat
17、ive! "); exit(0);ret = select(sockfd + 1, &writefds, NULL, NULL, &tival);if(ret =0 ) if(retry+ < 10) goto _ReCeive;if(ret <= 0) fprintf(stderr, " Error while receiving! "); exit(0);if(FD_ISSET(sockfd, &writefds) memset(buffer, 0, 1024); memset(httpheader, 0, 1024);if
18、(localfp = fopen(NodeCurr->file, "w") = NULL) if(DEBUG) fprintf(stderr, "create file '%s' error ", NodeCurr->file); return;/* receive data from web server */while(nbytes=read(sockfd,buffer,1)=1)if(i < 4) /*獲取HTTP 消息頭*/if(buffer0 = ' ' | buffer0 = '
19、39;)i+;else i = 0;memcpy(httpheader + j, buffer, 1); j+;else /*獲取HTTP 消息體*/fprintf(localfp, "%c", buffer0); /* print content on the screen */ d is: %s", +reqn, request);DoOnce();if(flag) fprintf(stdout, " The following is the response header: %s", httpheader);/*功能:從字符串src 中分
20、析出網站地址和端口,并得到文件和目錄*/int GetHost(char * src, char * web, char * file, int * port, char * dir) char * pA, * pB, * pC;int len;*port = 0;if(!(*src)return -1;pA = src;if(!strncmp(pA, "", strlen("")pA = src+strlen("");/* else if(!strncmp(pA, "", strlen("")
21、pA = src+strlen(""); */else return 1;pB = strchr(pA, '/');if(pB)len = strlen(pA) - strlen(pB);GetMemory(web, len);memcpy(*web), pA, len);if(*(pB+1)Rstrchr(pB + 1, '/', &pC);if(pC) len = strlen(pB + 1) - strlen(pC);else len = 0;if(len > 0) GetMemory(dir, len);memcpy(*
22、dir), pB + 1, len);if(pC + 1) len = strlen(pC + 1);GetMemory(file, len);memcpy(*file), pC + 1, len);else len = 1;GetMemory(file, len);memcpy(*file), e, len);else len = 1;GetMemory(dir, len);memcpy(*dir), e + 1, len);len = strlen(pB + 1);GetMemory(file, len);memcpy(*file), pB + 1, len);else len = 1;G
23、etMemory(dir, len);memcpy(*dir), e + 1, len);len = 1;GetMemory(file, len);memcpy(*file), e, len);elselen = strlen(pA);GetMemory(web, len);memcpy(*web), pA, strlen(pA);len = 1;GetMemory(dir, len);memcpy(*dir), e + 1, len);len = 1;GetMemory(file, len);memcpy(*file), e, len);pA = strchr(*web), ':
24、39;);if(pA)*port = atoi(pA + 1);else *port = 80;return 0;/*filename:*purpose: 用 C 語言編寫一個網絡蜘蛛來搜索網上出現的電子郵件地址*tidied by)周立發愛好者Linux 知識傳播者SOHO 族 開發者最擅長 C 語言*date time:2006-08-31 21:00:00*Note:任何人可以任意復制代碼并運用這些文檔,當然包括你的商業用途* 但請遵循 GPL*Thanks to: 廣東省 Linux 公共服務技術支持中心 */int main(int argc, char * argv)int Web
25、Port;char * WebHost = 0, * PageAddress = 0, * WebDir = 0;if(argc < 2) if(DEBUG) fprintf(stdout, "Command error, you should input like this: %s WebPageAddress1 WebPageAddress2 WebPageAddress3 .", argv0); exit(0);NodeHeader = NodeTail = NodeCurr = 0;5d:", FileNumber);DisplayNode(Node
26、Header); /* display every node */HandleInitNode(NodeHeader); /* handle every page */return 0;/*功能:分析網頁*/void AnalyzePage(WEBNODE * node)int fd;int flength = 0;fd = open(node->file, O_RDONL Y);if(fd = -1)goto _AnalyzeDone;flength = lseek(fd, 1, SEEK_END);write(fd, "0", 1);lseek(fd, 0, SE
27、EK_SET);mapped_mem = mmap(0, flength, PROT_READ, MAP_PRIVA TE, fd, 0); GetEmail(mapped_mem);GetLink(mapped_mem);close(fd);munmap(mapped_mem, flength);_AnalyzeDone:close(fd);node->IsHandled = 1;remove(node->file);/*功能:為根節點設置兄弟節點*/void AddInitNode(char * Host, char * Page, int Port, char * Dir)W
28、EBNODE * NewNode;char filenameMAXFILENAME + 1 = ""if(NodeHeader = NULL) NewNode = NodeHeader = (WEBNODE *)malloc(sizeof(WEBNODE);else NodeTail->brother = NewNode = (WEBNODE *)malloc(sizeof(WEBNODE); memset(NewNode, 0, sizeof(WEBNODE);NewNode->host = (char *)malloc(strlen(Host) + 1);m
29、emset(NewNode->host, 0, strlen(Host) + 1);NewNode->page = (char *)malloc(strlen(Page) + 1);memset(NewNode->page, 0, strlen(Page) + 1);NewNode->dir = (char *)malloc(strlen(Dir) + 1);memset(NewNode->dir, 0, strlen(Dir) + 1);NewNode->file = (char *)malloc(MAXFILENAME + 1);memset(NewNo
30、de->file, 0, MAXFILENAME + 1);strcpy(NewNode->host, Host);strcpy(NewNode->page, Page);strcpy(NewNode->dir, Dir);sprintf(filename, "file%", FileNumber+);strcpy(NewNode->file, filename);NewNode->port = Port;NewNode->IsHandled = 0;NewNode->brother = 0;NewNode->child
31、 = 0;NodeTail = NewNode;/*功能:處理根節點信息*/void HandleInitNode(WEBNODE * node)WEBNODE * CurrentNode = 0;CurrentNode = node;if(CurrentNode)while(CurrentNode)if(CurrentNode->IsHandled = 0)HandOneNode(CurrentNode);if(DEBUG)fprintf(stdout, " Display.%5d:", FileNumber);DisplayNode(NodeHeader);/*d
32、isplayeverynode */CurrentNode = CurrentNode->brother;CurrentNode = node;while(CurrentNode)if(CurrentNode->child&&CurrentNode->child->IsHandled= 0)HandleInitNode(CurrentNode->child);CurrentNode = CurrentNode->brother;/*功能:顯示年有節點信息*/void DisplayNode(WEBNODE * NodeHeader)WEBNO
33、DE * TempNode;TempNode = NodeHeader;fprintf(stdout, " ");while(TempNode) if(!strcmp(TempNode->dir, "/") fprintf(stdout, " %s:%d%s%s => %s %d ", TempNode->host, TempNode->port, TempNode->dir, strcmp(TempNode->page,"")?TempNode->page:"&
34、quot;, TempNode->file, TempNode->IsHandled);else fprintf(stdout, " %s:%d/%s/%s => %s %d ", TempNode->host, TempNode->port, TempNode->dir, strcmp(TempNode->page, "")?TempNode->page:"", TempNode->file, TempNode->IsHandled);TempNode = TempNode
35、->brother;TempNode = NodeHeader;while(TempNode) if(TempNode->child) DisplayNode(TempNode->child); TempNode = TempNode->brother;/*功能:處理單個節點信息*/void HandOneNode(WEBNODE * node)char UserAgent1024 = "", Accept1024 = "", AcceptLanguage1024 = "", AcceptEncoding102
36、4 = "", AcceptCharset1024 = "", KeepAlive1024 = "", Connection1024 = "", ContentType1024 = ""NodeCurr = node;if(host=gethostbyname(NodeCurr->host)=NULL) /* get ip address by domain */if(DEBUG)fprintf(stderr,"Gethostname '%s' error, %s
37、 ", NodeCurr->host,strerror(errno);exit(1);GetLocalAgent(UserAgent, Accept, AcceptLanguage, AcceptEncoding, AcceptCharset, KeepAlive, Connection, ContentType); /* Get client browser information */if(strcmp(NodeCurr->dir, "/")sprintf(request, "GET /%s/%s HTTP/ Host: %sUser-A
38、gent:%sAccept:%sConnection:%s",NodeCurr->dir,strcmp(NodeCurr->page,"")?NodeCurr->page:"", NodeCurr->host, UserAgent, Accept, Connection);elsesprintf(request, "GET %s%s HTTP/ Host: %s User-Agent: %s Accept: %sConnection:%s",NodeCurr->dir,strcmp(NodeCu
39、rr->page,"")?NodeCurr->page:"",NodeCurr->host, UserAgent, Accept, Connection);DoneWithList(1);AnalyzePage(NodeCurr);/*功能:從字符串src 中分析出郵件地址保存到文件*/void GetEmail(char * src)char * pa, * pb, * pc, *pd;char myemail1024 = ""FILE * mailfp = NULL;if(mailfp = fopen("
40、;", "a+") = NULL)return;pa = src;while(pb = strchr(pa, '')GetBeforePos(pb, &pc);GetAfterPos(pb, &pd);if(pc && pd && (strlen(pc) > (strlen(pd) + 3)memset(myemail, 0, 1024);memcpy(myemail, pc, strlen(pc) - strlen(pd);if(strcmp(NodeCurr->dir,"/&
41、quot;)fprintf(mailfp,"%s",myemail,NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "")?NodeCurr->page:"");elsefprintf(mailfp,"%s",myemail,NodeCurr->host,NodeCurr->dir, strcmp(NodeCurr->page, "")?NodeCurr->page:""
42、;);if(*(pd + 1)pa = pd + 1;else break;else if(*(pb + 1)elsebreak;pa = pb + 1;fclose(mailfp);/*功能:從src 中找出前面的字母、數字等內含,即email 地址中 的前面部分*/void GetBeforePos(char * src, char * d)char * x;if(src - 1)x = src - 1;else *d = 0; return ;while(x)if(*x >= 'a' && *x <= 'z') x-; cont
43、inue;else if(*x >= 'A' && *x <= 'Z') x-; continue;else if(*x >= '0' && *x <= '9') x-; continue;else if(*x = '.' | *x = '-' | *x = '_') x-; continue;else break;x+;if(x) *d = x;else *d = 0;/*功能:從src 中找出后面的字母、數字等內含,即ema
44、il 地址中 的后面部分*/void GetAfterPos(char * src, char * d)char * x;if(src + 1)x = src + 1;else *d = 0; return ;while(x)if(*x >= 'a' && *x <= 'z') x+; continue;else if(*x >= 'A' && *x <= 'Z') x+; continue;else if(*x >= '0' && *x
45、 <= '9') x+; continue;else if(*x = '.' | *x = '-' | *x = '_') x+; continue;else break;if(x) *d = x;else *d = 0;/*功能:從src 中找出前面的字母、數字等內含,即一個網頁地址中主機名后面的部分*/void GetAfterPosWithSlash(char * src, char * d)char * x;if(src)x = src;else *d = 0; return ;while(x)if(*x >=
46、 'a' && *x <= 'z') x+; continue;else if(*x >= 'A' && *x <= 'Z') x+; continue;else if(*x >= '0' && *x <= '9') x+; continue;else if(*x = '.' | *x = '-' | *x = '_' | *x = '=') x+; cont
47、inue; else if(*x = ':' | *x = '/' | *x = '?' | *x = '&') x+; continue; else break;if(x) *d = x;else *d = 0;/*功能:為myanchor 分配len 大小的內存*/void GetMemory(char * myanchor, int len)if(!(*myanchor)(*myanchor) = (char *)malloc(len + 1);else(*myanchor) = (char *)realloc(vo
48、id *)(*myanchor), len + 1);memset(*myanchor), 0, len + 1);/*功能:從src 中分析出網頁鏈接,并加入到當前節點的子節點上*/void GetLink(char * src)char * pa, * pb, * pc;char * myanchor = 0;int len = 0;pa = src;do if(pb = strstr(pa, "href='")pc = strchr(pb + 6, ''');len = strlen(pb + 6) - strlen(pc);GetMemory(&myanchor, len);memcpy(myanchor, pb + 6, len);else i
溫馨提示
- 1. 本站所有資源如無特殊說明,都需要本地電腦安裝 OFFICE2007 和 PDF 閱讀器。圖紙軟件為 CAD、CAXA、PROE、UG、SolidWorks 等。壓縮文件請下載最新的 WinRAR 軟件解壓。
- 2. 本站的文檔不包含任何第三方提供的附件圖紙等,如果需要附件,請聯系上傳者。文件的所有權益歸上傳用戶所有。
- 3. 本站 RAR 壓縮包中若帶圖紙,網頁內容裡面會有圖紙預覽,若沒有圖紙預覽就沒有圖紙。
- 4. 未經權益所有人同意不得將文件中的內容挪作商業或盈利用途。
- 5. 人人文庫網僅提供信息存儲空間,僅對用戶上傳內容的表現方式做保護處理,對用戶上傳分享的文檔內容本身不做任何修改或編輯,並不能對任何下載內容負責。
- 6. 下載文件中如有侵權或不適當內容,請與我們聯系,我們立即糾正。
- 7. 本站不保證下載資源的準確性、安全性和完整性, 同時也不承擔用戶因使用這些下載資源對自己和他人造成任何形式的傷害或損失。
最新文檔
- 中考語文作文預測范文6篇及題目
- 抖音商戶跨部門協作項目推進辦法
- 全球汽車零部件行業自動化生產技術發展趨勢報告
- 八大城市物流企業物流園區投資熱點與風險預測研究報告
- 2024-2025學年福建省三明市梅列區梅列、永安七上數學期末調研模擬試題含解析
- 北京十一學校2024年化學九上期末統考模擬試題含解析
- 2024-2025學年江蘇省無錫市河塘中學化學九年級第一學期期末質量檢測模擬試題含解析
- 重慶三峽學院《園林資源及應用》2023-2024學年第一學期期末試卷
- 藥店干貨知識培訓課件
- 共享出行信用評價體系構建與平臺運營效率提升2025報告
- 復式公寓分割協議書
- 海上風電運維船安全
- 生產經營單位事故隱患內部報告獎勵機制實踐與案例
- 2025年江西省金控科技產業集團社會招聘4人(第一批次)筆試參考題庫附帶答案詳解
- 菜園開墾種植合同協議
- 紡織品紗線疵點分析與處理考核試卷
- AI賦能下的護理專業教育與培訓革新
- 瓦楞紙板生產線操作機長培訓講義
- 2025電子病歷書寫基本規范
- 全年病蟲害防治明細表
- 林權林地轉租協議書
評論
0/150
提交評論