Jeg har lavet en meget simpel søgerobot:
#include <windows.h>
#include <wininet.h>
#include <algorithm>
#include <cctype>
#include <iostream>
#include <list>
#include <string>
// Global crawl frontier: URLs discovered so far, in discovery order.
// FindUrl only appends URLs not already present, so it doubles as the
// visited set; main() walks it while CheckUrl/FindUrl append to it.
std::list<std::string> UrlList;
// Case-insensitive substring search (strstr semantics).
//
// Returns a pointer to the first occurrence of s2 within s1, or 0 if s2
// does not occur.  An empty s2 matches at the start of s1.
//
// Bug fix: the original compared against the multicharacter literal '\\0'
// (a nonzero, implementation-defined value), so the "matched all of s2"
// test never succeeded and the function could never report a match.
// Also cast to unsigned char before tolower() — passing a negative char
// is undefined behavior.
const char *StriStr(const char *s1, const char *s2)
{
    const char *t = s1;
    while(*t)
    {
        const char *n = t;
        const char *c = s2;
        while(*c != '\0' && *n != '\0' &&
              tolower((unsigned char)*n) == tolower((unsigned char)*c))
        {
            c++;
            n++;
        }
        if(*c == '\0') // consumed all of s2: match starting at t
            return t;
        t++;
    }
    // Empty needle matches an empty (or any) haystack at its start.
    return (*s2 == '\0') ? s1 : 0;
}
// Return the scheme+host part of aUrl, e.g. "http://host" for
// "http://host/dir/page.html".  If no path follows the host, aUrl is
// returned unchanged.
//
// Bug fix: the original included the slash after the host in the
// with-path case ("http://host/") but not in the no-path case, so
// appending a root-relative link ("/page") produced "http://host//page"
// and duplicate frontier entries.  The base is now consistently returned
// WITHOUT a trailing slash.
std::string GetBase(const char *aUrl)
{
    std::string Url(aUrl);
    std::string::size_type SchemePos = Url.find("//");
    if(SchemePos != std::string::npos)
    {
        // First '/' after the "//" marks the start of the path.
        std::string::size_type PathPos = Url.find('/', SchemePos + 2);
        if(PathPos != std::string::npos)
        {
            return Url.substr(0, PathPos); // exclude the slash itself
        }
    }
    return Url;
}
// Return aUrl truncated after its last '/' (the "directory" a relative
// link is resolved against), e.g. "http://host/dir/" for
// "http://host/dir/page.html".  Slashes that are part of the scheme
// separator "//" are not treated as a directory boundary.  If there is
// no usable '/', aUrl is returned unchanged.
//
// Bug fix: when aUrl contained no "//", the original computed
// npos + 2, which wraps around (unsigned overflow) to 1 and made the
// comparison meaningless.  Missing separators are now handled explicitly.
std::string GetBaseDir(const char *aUrl)
{
    std::string Url(aUrl);
    std::string::size_type SchemePos = Url.find("//");
    // First position that may legally start a path component.
    std::string::size_type HostStart =
        (SchemePos == std::string::npos) ? 0 : SchemePos + 2;
    std::string::size_type LastSlash = Url.rfind('/');
    if(LastSlash != std::string::npos && LastSlash >= HostStart)
    {
        return Url.substr(0, LastSlash + 1); // keep the trailing '/'
    }
    return Url;
}
// Scan an HTML buffer for "<a href=" links, resolve each one against
// aCurrentUrl (absolute, root-relative or document-relative) and append
// every URL not already present to the global UrlList.
//
// Bug fix: the attribute-copy loop ran until '>' with no bounds check
// and no end-of-string check, so malformed HTML (no closing '>', or a
// value longer than 1023 chars) overflowed Url[] and read past the page
// buffer.  Both conditions now terminate the copy.
void FindUrl(const char *aBuffer, const char *aCurrentUrl)
{
    const char *p = aBuffer;
    while((p = StriStr(p, "<a href=")) != 0)
    {
        p += 8; // Skip "<a href="
        char Url[1024];
        size_t idx;
        // Copy the attribute value up to '>', stopping at end of buffer
        // or when Url is full (leave room for the terminator).
        for(idx = 0; *p && *p != '>' && idx < sizeof(Url) - 1; idx++, p++)
            Url[idx] = *p;
        Url[idx] = 0;
        if(Url[0] == '"')
        {
            // Strip the surrounding quotes by shifting left and cutting
            // at the closing quote.
            for(idx = 1; Url[idx] && Url[idx] != '"'; idx++)
                Url[idx - 1] = Url[idx];
            Url[idx - 1] = 0;
        }
        std::string T;
        if(StriStr(Url, "http://") == Url) // already absolute
        {
            std::cout << "Complete Url: " << Url << std::endl;
            T = Url;
        }
        else if(Url[0] == '/') // root-relative: join with scheme+host
        {
            T = GetBase(aCurrentUrl);
            if(!T.empty() && T[T.size() - 1] == '/')
                T.erase(T.size() - 1); // avoid "http://host//path"
            T += Url;
            std::cout << "From root: " << T << std::endl;
        }
        else // document-relative: join with the current directory
        {
            T = GetBaseDir(aCurrentUrl);
            T += Url;
            std::cout << "From Current Dir: " << T << std::endl;
        }
        // Only queue URLs we have not seen before.
        if(std::find(UrlList.begin(), UrlList.end(), T) == UrlList.end())
            UrlList.push_back(T);
    }
}
// Download the page at aUrl (up to 1 MiB) through the given WinInet
// session handle and feed its contents to FindUrl() to harvest links.
//
// Bug fixes:
//  - the original printed an error when InternetOpenUrl failed but then
//    fell through and used the null handle; we now return early.
//  - Buffer[CurrentPos] = 0 wrote one byte past the end when the page
//    filled the buffer exactly; the allocation now reserves +1 for the
//    terminator.
//  - an InternetReadFile failure left ReadLen untouched and could loop
//    forever; the return value is now checked.
void CheckUrl(HINTERNET aInternet, const std::string &aUrl)
{
    std::cout << "Try: " << aUrl << std::endl;
    HINTERNET Url = InternetOpenUrl(aInternet, aUrl.c_str(), 0, 0, INTERNET_FLAG_RAW_DATA, 0);
    if(!Url)
    {
        std::cout << "Failed to open Url: " << aUrl << ", Error: " << GetLastError() << std::endl;
        return; // nothing to read — do not use the null handle
    }
    const DWORD BufferSize = 1024 * 1024;
    char *Buffer = new char[BufferSize + 1]; // +1 for the terminator
    DWORD ReadLen = 0, CurrentPos = 0;
    do
    {
        ReadLen = 0;
        if(!InternetReadFile(Url, (void *)(Buffer + CurrentPos),
                             BufferSize - CurrentPos, &ReadLen))
            break; // read error — parse whatever we already have
        CurrentPos += ReadLen;
    }
    while(ReadLen && CurrentPos < BufferSize);
    Buffer[CurrentPos] = 0; // Buffer is now the (truncated) web page
    FindUrl(Buffer, aUrl.c_str());
    delete [] Buffer;
    InternetCloseHandle(Url);
}
// Entry point: open a WinInet session, seed the frontier with one URL
// and crawl it breadth-first.  std::list is used for UrlList precisely
// because push_back never invalidates existing iterators, so CheckUrl ->
// FindUrl may append new URLs while this loop is walking the list.
int main()
{
    HINTERNET Internet = InternetOpen("My Url Browser", INTERNET_OPEN_TYPE_DIRECT, 0, 0, 0);
    if(!Internet)
    {
        std::cout << "Failed to open internet: " << GetLastError() << std::endl;
        return 1;
    }
    UrlList.push_back("http://udvikleren.dk/"); // The start point
    for(std::list<std::string>::iterator it = UrlList.begin(); it != UrlList.end(); ++it)
    {
        // Fix: pass the string itself; the original built a needless
        // temporary std::string from it->c_str().
        CheckUrl(Internet, *it);
    }
    InternetCloseHandle(Internet);
    return 0;
}
Parseren mangler en del, men man bør kunne få en idé om, hvordan det kan gøres.