-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
initial and add Baidu web spider Larvae
- Loading branch information
0 parents
commit af6d544
Showing
18 changed files
with
1,291 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#!/usr/bin/python | ||
import re | ||
import cookielib | ||
import urllib | ||
import urllib2 | ||
|
||
# build cookies jar | ||
cj = cookielib.CookieJar() | ||
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) | ||
urllib2.install_opener(opener) | ||
|
||
# get cookie | ||
MainUrl = "http://www.baidu.com/" | ||
res = urllib2.urlopen(MainUrl) | ||
for index, cookie in enumerate(cj): | ||
print '[',index,']',cookie | ||
|
||
# get tooken | ||
getTokenUrl = "https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true" | ||
res = urllib2.urlopen(getTokenUrl) | ||
Html = res.read() | ||
tokenStr = "bdPass.api.params.login_token='(.*?)';" | ||
tokenObj = re.compile(tokenStr,re.DOTALL) | ||
matchObj = tokenObj.findall(Html) | ||
for index, cookie in enumerate(cj): | ||
print '[',index, ']',cookie; | ||
if matchObj: | ||
token = matchObj[0] | ||
print matchObj[0] | ||
|
||
# Login Baidu | ||
LoginUrl = "https://passport.baidu.com/v2/api/?login" | ||
postDict = { | ||
'charset':"utf-8", | ||
'token':token, | ||
'isPhone':"false", | ||
'index':"0", | ||
'staticpage':"https://passport.baidu.com/static/passpc-account/html/v3Jump.html", | ||
'logintype':"basicLogin", | ||
'tpl':"mn", | ||
'callback':"parent.bd__pcbs__b0pp2q", | ||
'username':"[email protected]", | ||
'password':"220496", | ||
'mem_pass':"on", | ||
} | ||
postData = urllib.urlencode(postDict); | ||
print postData | ||
req = urllib2.Request(LoginUrl,postData) | ||
req.add_header('Content-Type', "application/x-www-form-urlencoded") | ||
res = urllib2.urlopen(req) | ||
|
||
# handle the baidu music web | ||
Html = res.read() | ||
print Html | ||
for index, cookie in enumerate(cj): | ||
print '[',index, ']',cookie; | ||
urlStr = "encodeURI\('(.*?)'\);" | ||
urlObj = re.compile(urlStr,re.DOTALL) | ||
matchUrl = urlObj.findall(Html) | ||
if matchUrl: | ||
returnUrl = matchUrl[0] | ||
#req = urllib2.Request(returnUrl) | ||
#res = urllib2.urlopen(req) | ||
#print res.read() | ||
songUrl = "http://music.baidu.com/song/120948904/download" | ||
req = urllib2.Request(songUrl) | ||
res = urllib2.urlopen(req) | ||
print res.read() | ||
# TODO: get the download link |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
#include "hash.h" | ||
#include "stdafx.h" | ||
|
||
//外部函数 | ||
// Flattens the (f1, f2_f1, t) triple into a single table index:
// f1 is weighted by 2^12, f2_f1 by 2^6, and t is added unscaled.
inline size_t HashTableOffset(size_t f1, size_t f2_f1, size_t t){
    const size_t f1_part = f1 << 12;
    const size_t delta_part = f2_f1 << 6;
    return f1_part + delta_part + t;
}
/* | ||
size_t* ValueTableAddr(size_t* start, HashKeyInfo* pKey){ | ||
return start + pKey->start + pKey->length; | ||
} | ||
*/ | ||
//类方法 | ||
/************************************ Functions for build tracks.(iFlyBuild) ************************************************/ | ||
//此函数用于对于iFlyBuild的情况,从wav到Hash_Table. //Finished. | ||
void THash::BuildInit(){ | ||
//Initialize memory value. | ||
pValueStart = (size_t*)malloc(sizeof(size_t) * HashKeyNum * ValuePerBlock * BlockNum); | ||
if (pValueStart==NULL) | ||
printf("error"); | ||
memset(pValueStart, 0, sizeof(size_t) * HashKeyNum * ValuePerBlock * BlockNum); | ||
//连续内存中,初始化已分配的结尾地址。 | ||
pValueEnd = pValueStart + sizeof(size_t) * ValuePerBlock; | ||
//初始化HashKey表 | ||
key_info = (HashKeyInfo*)malloc(sizeof(HashKeyInfo) * HashKeyNum); | ||
for (int i = 0; i < HashKeyNum; i++){ | ||
key_info[i].next = nullptr; | ||
key_info[i].length = 0; | ||
} | ||
} | ||
|
||
void THash::BuildUnInit(){ | ||
for (int i=0; i<HashKeyNum; i++){ | ||
HashKeyInfo *p = key_info[i].next; | ||
HashKeyInfo *q; | ||
while(p){ | ||
q = p; | ||
p = p->next; | ||
free(q); | ||
} | ||
} | ||
free(key_info); | ||
free(pValueStart); | ||
} | ||
|
||
//加歌名,更新歌曲数。 | ||
void THash::AddSongList(const char *filename){ | ||
strcpy_s(song_list[song_num], strlen(filename) + 1, filename); | ||
song_num++; | ||
} | ||
|
||
//往Value的内存块里加数据,更新Key_table. | ||
void THash::InsertHash(size_t f1, size_t f2_f1, size_t t, size_t id, size_t offset){ | ||
//异常处理 | ||
if (id > (1 << ID_BITS) - 1 || offset > (1 << OFFSET_BITS) - 1){ | ||
printf("The id/time offset overflow.\n"); | ||
return; | ||
} | ||
//当前key的地址 | ||
HashKeyInfo *pKey = &key_info[HashTableOffset(f1, f2_f1, t)]; | ||
if (pKey->length%ValuePerBlock==0){ | ||
/*在原来的连续内存中再申请一块*/ | ||
HashKeyInfo * pNode = (HashKeyInfo*)malloc(sizeof(HashKeyInfo)); | ||
/* 内存溢出 */ | ||
if ((pValueEnd - pValueStart) > OverFlowThreshold){ | ||
printf("Memory out.\n"); | ||
return; | ||
} | ||
pNode->start = pValueEnd; | ||
pValueEnd += ValuePerBlock*sizeof(size_t); | ||
pNode->length = 0; | ||
pNode->next = pKey->next; | ||
pKey->next = pNode; | ||
} | ||
size_t* pValue = pKey->next->start + pKey->next->length; | ||
*pValue = (size_t)((id << OFFSET_BITS) + offset); | ||
pKey->next->length++; | ||
pKey->length++; | ||
/* For File Write */ | ||
data_num++; | ||
} | ||
|
||
//将Hash表往文件里刷(不是刷整个内存,这样会在iFlySelect里浪费内存空间) | ||
void THash::Hash2File(const char* filename){ | ||
FILE *fp; | ||
fopen_s(&fp, filename, "wb"); | ||
if (fp == NULL){ | ||
printf("File open WRONG.\n"); | ||
} | ||
//Write SongName | ||
fwrite(&song_num, sizeof(size_t), 1, fp); | ||
printf("共%d首歌\n", song_num); | ||
for (size_t i = 0; i<song_num; i++) | ||
fwrite(song_list[i], sizeof(char), strlen(song_list[i]) + 1, fp); | ||
//Write hash table. | ||
fwrite(&data_num, sizeof(size_t), 1, fp); | ||
for (int i = 0; i<HashKeyNum; i++){ | ||
HashKeyInfo *p = key_info[i].next; | ||
while (p){ | ||
fwrite(p->start, sizeof(size_t), p->length, fp); | ||
p = p->next; | ||
} | ||
} | ||
size_t start_place = 0; | ||
for (int i = 0; i<HashKeyNum; i++){ | ||
fwrite(&start_place, sizeof(size_t), 1, fp); | ||
fwrite(&key_info[i].length, sizeof(size_t), 1, fp); | ||
start_place += key_info[i].length; | ||
} | ||
fclose(fp); | ||
} | ||
|
||
/************************************ Functions for select tracks.(iFlySelect) ************************************************/ | ||
|
||
void THash::File2Hash(const char *filename){ | ||
FILE *fp; | ||
fopen_s(&fp, filename, "rb"); | ||
char *chp; | ||
fread(&song_num, sizeof(size_t), 1, fp); | ||
printf("共%d首歌\n", song_num); | ||
for (size_t i = 0; i<song_num; i++){ | ||
chp = song_list[i]; | ||
do{ | ||
fread(chp, sizeof(char), 1, fp); | ||
} while (*(chp++) != 0); | ||
} | ||
fread(&data_num, sizeof(size_t), 1, fp); | ||
pValueStart = (size_t*)malloc(sizeof(size_t)* data_num); | ||
fread(pValueStart, sizeof(size_t), data_num, fp); | ||
key_table = (HashKeyTable*)malloc(sizeof(HashKeyTable)*HashKeyNum); | ||
fread(key_table, sizeof(size_t)* 2, HashKeyNum, fp); | ||
for (int i = 0; i<HashKeyNum; i++) | ||
key_table[i].start = (size_t)key_table[i].start + pValueStart; | ||
} | ||
|
||
void THash::ReBuildInit(){ | ||
vote_table = (short **)malloc(sizeof(short*)* song_num); | ||
for (size_t i = 0; i<song_num; i++){ | ||
vote_table[i] = (short *)malloc(sizeof(short)* (1 << OFFSET_BITS)); | ||
assert(vote_table[i] != NULL); | ||
} | ||
} | ||
|
||
void THash::VoteInit(){ | ||
for (size_t i = 0; i<song_num; i++){ | ||
memset(vote_table[i], 0, sizeof(short)* (1 << OFFSET_BITS)); | ||
} | ||
return; | ||
} | ||
|
||
void THash::Vote(size_t f1, size_t f2_f1, size_t t, size_t offset){ | ||
HashKeyTable *pKey = &key_table[HashTableOffset(f1, f2_f1, t)]; | ||
size_t length = pKey->length; | ||
while (length){ | ||
length--; | ||
size_t offset_value = (*(pKey->start + length) << ID_BITS) >> ID_BITS; | ||
if (offset_value < offset) | ||
continue; // 为失效投票,这种情况的投票结果为错的 | ||
vote_table[(*(pKey->start + length)) >> OFFSET_BITS][offset_value - offset]++; | ||
} | ||
return; | ||
} | ||
|
||
size_t THash::VoteResult(size_t &offset){ | ||
size_t result = 0; | ||
short max = -1; | ||
for (size_t i = 0; i < song_num; i++){ | ||
for (size_t j = 0; j < (1<<OFFSET_BITS); j++) | ||
if (vote_table[i][j] > max){ | ||
max = vote_table[i][j]; | ||
result = i; | ||
offset = j; | ||
//if (vote_table[i][j]) | ||
// printf("%d %d %d\n", i, j, vote_table[i][j]); | ||
} | ||
} | ||
return result; | ||
} | ||
|
||
// Sets every table pointer and counter to an empty state and pre-allocates
// the song name list: MAX_SONG_NUM slots of MAX_SONG_LEN bytes each.
// If an allocation fails, "error" is printed and the remaining slots (or the
// whole list) stay null.
THash::THash(){
    pValueStart = nullptr;
    pValueEnd = nullptr;
    vote_table = nullptr;
    data_num = 0;
    song_num = 0;
    key_info = nullptr;
    key_table = nullptr;   // was previously left uninitialized

    // calloc keeps every slot null until its buffer is actually allocated.
    song_list = static_cast<char**>(calloc(MAX_SONG_NUM, sizeof(char*)));
    if (song_list == nullptr){
        printf("error");
        return;
    }
    for (int i = 0; i < MAX_SONG_NUM; i++){
        song_list[i] = static_cast<char*>(malloc(MAX_SONG_LEN * sizeof(char)));
        if (song_list[i] == nullptr){
            printf("error");
            break;
        }
    }
}
|
||
// Releases the song name list allocated by the constructor.
// NOTE(review): pValueStart, key_info, key_table and vote_table are not
// released here; the build-side buffers are freed by BuildUnInit -- confirm
// who owns the select-side buffers.
THash::~THash(){
    if (song_list == nullptr)
        return;   // nothing was allocated
    for (int i = 0; i < MAX_SONG_NUM; i++)
        free(song_list[i]);
    free(song_list);
    song_list = nullptr;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#ifndef _HashFunc_h_ | ||
#define _HashFunc_h_ | ||
|
||
#include "stdlib.h" | ||
#include "memory.h" | ||
#include "stdio.h" | ||
#include "assert.h" | ||
#include "string.h" | ||
#include <iostream> | ||
|
||
#define ID_BITS 18 // Bits for the song id: roughly 260,000 songs in total (if the server has enough memory).
#define OFFSET_BITS 14 // Bits for the time offset; assumes each song is shorter than 8.7 minutes.
#define MAX_SONG_NUM (2<<18) // NOTE(review): this is 2^19, twice the number of ids ID_BITS can address -- confirm (1<<18) was not intended.
#define MAX_SONG_LEN 256 // Bytes reserved for each song name, terminator included.
#define HashKeyNum (1<<20) // Number of hash keys.
#define ValuePerBlock (1<<6) // Values per block (64 * sizeof(size_t) bytes); a block holds several values of one hash bucket and more blocks can be chained.
#define BlockNum 4 // Block count, used to size the pool for dynamic growth.
#define OverFlowThreshold (1<<28) // Parenthesized so the macro stays a single operand inside larger expressions.
using namespace std; | ||
|
||
//Hash的key类型,用于iFluBuild | ||
//(f1, f2_f1, t) | ||
struct HashKeyInfo{ | ||
size_t* start; | ||
size_t length; | ||
HashKeyInfo* next; //指针用于扩容 | ||
}; | ||
//Hash的key类型,用于iFluSelect | ||
struct HashKeyTable{ | ||
size_t *start; | ||
size_t length; | ||
}; | ||
|
||
class THash{ | ||
private: | ||
public: | ||
size_t *pValueStart; | ||
size_t *pValueEnd; | ||
short **vote_table; | ||
size_t data_num; | ||
char **song_list; | ||
size_t song_num; | ||
HashKeyInfo *key_info; | ||
HashKeyTable *key_table; | ||
|
||
THash::THash(); | ||
THash::~THash(); | ||
void THash::ReBuildInit(); | ||
/************************************ Functions for build tracks.(iFlyBuild) ************************************************/ | ||
//此函数用于对于iFlyBuild的情况,从wav到Hash_Table. //Finished. | ||
void THash::BuildInit(); | ||
void THash::BuildUnInit(); | ||
//加歌名,更新歌曲数。 //Finished. | ||
void THash::AddSongList(const char *filename); | ||
//往Value的内存块里加数据,更新Key_table. //Finished. | ||
void THash::InsertHash(size_t f1, size_t f2_f1, size_t t, size_t id, size_t offset); | ||
//将Hash表往文件里刷(不是刷整个内存,这样会在iFlySelect里浪费内存空间)//Finished | ||
void THash::Hash2File(const char* filename); | ||
|
||
/************************************ Functions for select tracks.(iFlySelect) ************************************************/ | ||
size_t* THash::GetHash(size_t f1, size_t f2_f1, size_t t); | ||
void THash::File2Hash(const char* filename); | ||
//Functions for vote and save the top voted id to QueryId. | ||
void THash::VoteInit(); | ||
void THash::Vote(size_t f1, size_t f2_f1, size_t t, size_t offset); | ||
size_t THash::VoteResult(size_t &offset); | ||
}; | ||
#endif // _HashFunc_h_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
======================================================================== | ||
动态链接库:iFlyBuild 项目概述 | ||
======================================================================== | ||
|
||
应用程序向导已为您创建了此 iFlyBuild DLL。 | ||
|
||
本文件概要介绍组成 iFlyBuild 应用程序的每个文件的内容。 | ||
|
||
|
||
iFlyBuild.vcxproj | ||
这是使用应用程序向导生成的 VC++ 项目的主项目文件,其中包含生成该文件的 Visual C++ 的版本信息,以及有关使用应用程序向导选择的平台、配置和项目功能的信息。 | ||
|
||
iFlyBuild.vcxproj.filters | ||
这是使用“应用程序向导”生成的 VC++ 项目筛选器文件。它包含有关项目文件与筛选器之间的关联信息。在 IDE 中,通过这种关联,在特定节点下以分组形式显示具有相似扩展名的文件。例如,“.cpp”文件与“源文件”筛选器关联。 | ||
|
||
iFlyBuild.cpp | ||
这是主 DLL 源文件。 | ||
|
||
此 DLL 在创建时不导出任何符号。因此,生成时不会产生 .lib 文件。如果希望此项目成为其他某个项目的项目依赖项,则需要添加代码以从 DLL 导出某些符号,以便产生一个导出库,或者,也可以在项目“属性页”对话框中的“链接器”文件夹中,将“常规”属性页上的“忽略输入库”属性设置为“是”。 | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
应用程序向导创建了下列资源: | ||
|
||
iFlyBuild.rc | ||
这是程序使用的所有 Microsoft Windows 资源的列表。它包括 RES 子目录中存储的图标、位图和光标。此文件可以直接在 Microsoft Visual C++ 中进行编辑。 | ||
|
||
Resource.h | ||
这是标准头文件,可用于定义新的资源 ID。Microsoft Visual C++ 将读取并更新此文件。 | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
其他标准文件: | ||
|
||
StdAfx.h, StdAfx.cpp | ||
这些文件用于生成名为 iFlyBuild.pch 的预编译头 (PCH) 文件和名为 StdAfx.obj 的预编译类型文件。 | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
其他注释: | ||
|
||
应用程序向导使用“TODO:”注释来指示应添加或自定义的源代码部分。 | ||
|
||
///////////////////////////////////////////////////////////////////////////// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
//{{NO_DEPENDENCIES}} | ||
// Microsoft Visual C++ generated include file. | ||
// Used by iFlyBuild.rc | ||
// | ||
|
||
#define IDS_APP_TITLE 103 | ||
|
||
// Next default values for new objects | ||
// | ||
#ifdef APSTUDIO_INVOKED | ||
#ifndef APSTUDIO_READONLY_SYMBOLS | ||
#define _APS_NEXT_RESOURCE_VALUE 101 | ||
#define _APS_NEXT_COMMAND_VALUE 40001 | ||
#define _APS_NEXT_CONTROL_VALUE 1000 | ||
#define _APS_NEXT_SYMED_VALUE 101 | ||
#endif | ||
#endif |
Oops, something went wrong.