Skip to content

Commit

Permalink
initial and add Baidu web spider Larvae
Browse files Browse the repository at this point in the history
  • Loading branch information
jhyume committed Jul 10, 2014
0 parents commit af6d544
Show file tree
Hide file tree
Showing 18 changed files with 1,291 additions and 0 deletions.
69 changes: 69 additions & 0 deletions BaiduMusicSpider/Spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/python
import re
import cookielib
import urllib
import urllib2

# build cookies jar
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)

# get cookie
MainUrl = "http://www.baidu.com/"
res = urllib2.urlopen(MainUrl)
for index, cookie in enumerate(cj):
print '[',index,']',cookie

# get tooken
getTokenUrl = "https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true"
res = urllib2.urlopen(getTokenUrl)
Html = res.read()
tokenStr = "bdPass.api.params.login_token='(.*?)';"
tokenObj = re.compile(tokenStr,re.DOTALL)
matchObj = tokenObj.findall(Html)
for index, cookie in enumerate(cj):
print '[',index, ']',cookie;
if matchObj:
token = matchObj[0]
print matchObj[0]

# Login Baidu
LoginUrl = "https://passport.baidu.com/v2/api/?login"
postDict = {
'charset':"utf-8",
'token':token,
'isPhone':"false",
'index':"0",
'staticpage':"https://passport.baidu.com/static/passpc-account/html/v3Jump.html",
'logintype':"basicLogin",
'tpl':"mn",
'callback':"parent.bd__pcbs__b0pp2q",
'username':"[email protected]",
'password':"220496",
'mem_pass':"on",
}
postData = urllib.urlencode(postDict);
print postData
req = urllib2.Request(LoginUrl,postData)
req.add_header('Content-Type', "application/x-www-form-urlencoded")
res = urllib2.urlopen(req)

# handle the baidu music web
Html = res.read()
print Html
for index, cookie in enumerate(cj):
print '[',index, ']',cookie;
urlStr = "encodeURI\('(.*?)'\);"
urlObj = re.compile(urlStr,re.DOTALL)
matchUrl = urlObj.findall(Html)
if matchUrl:
returnUrl = matchUrl[0]
#req = urllib2.Request(returnUrl)
#res = urllib2.urlopen(req)
#print res.read()
songUrl = "http://music.baidu.com/song/120948904/download"
req = urllib2.Request(songUrl)
res = urllib2.urlopen(req)
print res.read()
# TODO: get the download link
196 changes: 196 additions & 0 deletions Hash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
#include "hash.h"
#include "stdafx.h"

// Free helper: folds the (f1, f2_f1, t) triple into a single flat index.
// f1 is weighted by 2^12 and f2_f1 by 2^6; t is added unchanged.
inline size_t HashTableOffset(size_t f1, size_t f2_f1, size_t t){
    const size_t f1_part = f1 << 12;
    const size_t delta_part = f2_f1 << 6;
    return f1_part + delta_part + t;
}
/*
size_t* ValueTableAddr(size_t* start, HashKeyInfo* pKey){
return start + pKey->start + pKey->length;
}
*/
// Class methods
/************************************ Functions for build tracks.(iFlyBuild) ************************************************/
// Build-side (iFlyBuild) initialisation.
// Allocates the zero-filled value pool (HashKeyNum * ValuePerBlock * BlockNum
// size_t elements) and the per-key bookkeeping table, and resets the pool
// cursor. If either allocation fails, "error" is printed and the object is
// left with null pointers instead of writing through a null pointer.
void THash::BuildInit(){
    const size_t pool_elems = (size_t)HashKeyNum * ValuePerBlock * BlockNum;

    // calloc returns zero-filled memory, which replaces the malloc + memset pair.
    pValueStart = (size_t*)calloc(pool_elems, sizeof(size_t));
    key_info = (HashKeyInfo*)malloc(sizeof(HashKeyInfo) * HashKeyNum);
    if (pValueStart == NULL || key_info == NULL){
        printf("error");
        free(pValueStart);
        free(key_info);
        pValueStart = nullptr;
        pValueEnd = nullptr;
        key_info = nullptr;
        return;
    }

    // No block has been handed out yet, so the cursor starts at the pool base.
    // (Pointer arithmetic on size_t* already counts in elements; the previous
    // "+ sizeof(size_t) * ValuePerBlock" scaled the offset twice.)
    pValueEnd = pValueStart;

    // Every key starts with an empty block chain.
    for (int i = 0; i < HashKeyNum; i++){
        key_info[i].start = nullptr;
        key_info[i].length = 0;
        key_info[i].next = nullptr;
    }
}

void THash::BuildUnInit(){
for (int i=0; i<HashKeyNum; i++){
HashKeyInfo *p = key_info[i].next;
HashKeyInfo *q;
while(p){
q = p;
p = p->next;
free(q);
}
}
free(key_info);
free(pValueStart);
}

//加歌名,更新歌曲数。
void THash::AddSongList(const char *filename){
strcpy_s(song_list[song_num], strlen(filename) + 1, filename);
song_num++;
}

//往Value的内存块里加数据,更新Key_table.
void THash::InsertHash(size_t f1, size_t f2_f1, size_t t, size_t id, size_t offset){
//异常处理
if (id > (1 << ID_BITS) - 1 || offset > (1 << OFFSET_BITS) - 1){
printf("The id/time offset overflow.\n");
return;
}
//当前key的地址
HashKeyInfo *pKey = &key_info[HashTableOffset(f1, f2_f1, t)];
if (pKey->length%ValuePerBlock==0){
/*在原来的连续内存中再申请一块*/
HashKeyInfo * pNode = (HashKeyInfo*)malloc(sizeof(HashKeyInfo));
/* 内存溢出 */
if ((pValueEnd - pValueStart) > OverFlowThreshold){
printf("Memory out.\n");
return;
}
pNode->start = pValueEnd;
pValueEnd += ValuePerBlock*sizeof(size_t);
pNode->length = 0;
pNode->next = pKey->next;
pKey->next = pNode;
}
size_t* pValue = pKey->next->start + pKey->next->length;
*pValue = (size_t)((id << OFFSET_BITS) + offset);
pKey->next->length++;
pKey->length++;
/* For File Write */
data_num++;
}

// Writes the build-side hash table to `filename` (only the used values are
// written, not the whole pool, so the select side does not waste memory).
//
// File layout, all integers written as raw size_t:
//   song_num, then song_num NUL-terminated names
//   data_num, then the values of every key, key by key
//   HashKeyNum records of (start index, length) describing each key's values
//
// Prints a message and writes nothing if the table was never built or the
// file cannot be opened.
void THash::Hash2File(const char* filename){
    if (key_info == nullptr){
        printf("Hash table is not initialized.\n");
        return;
    }
    FILE *fp = NULL;
    if (fopen_s(&fp, filename, "wb") != 0 || fp == NULL){
        printf("File open WRONG.\n");
        return;   // never write through a NULL FILE*
    }

    // Song names.
    fwrite(&song_num, sizeof(size_t), 1, fp);
    // song_num is a size_t; cast it to match the format specifier.
    printf("共%lu首歌\n", (unsigned long)song_num);
    for (size_t i = 0; i < song_num; i++)
        fwrite(song_list[i], sizeof(char), strlen(song_list[i]) + 1, fp);

    // Values: for each key, each block of its chain in chain order.
    fwrite(&data_num, sizeof(size_t), 1, fp);
    for (int i = 0; i < HashKeyNum; i++){
        for (HashKeyInfo *p = key_info[i].next; p != nullptr; p = p->next)
            fwrite(p->start, sizeof(size_t), p->length, fp);
    }

    // Key directory: running start index plus length for every key.
    size_t start_place = 0;
    for (int i = 0; i < HashKeyNum; i++){
        fwrite(&start_place, sizeof(size_t), 1, fp);
        fwrite(&key_info[i].length, sizeof(size_t), 1, fp);
        start_place += key_info[i].length;
    }

    if (ferror(fp))
        printf("File write WRONG.\n");
    fclose(fp);
}

/************************************ Functions for select tracks.(iFlySelect) ************************************************/

void THash::File2Hash(const char *filename){
FILE *fp;
fopen_s(&fp, filename, "rb");
char *chp;
fread(&song_num, sizeof(size_t), 1, fp);
printf("共%d首歌\n", song_num);
for (size_t i = 0; i<song_num; i++){
chp = song_list[i];
do{
fread(chp, sizeof(char), 1, fp);
} while (*(chp++) != 0);
}
fread(&data_num, sizeof(size_t), 1, fp);
pValueStart = (size_t*)malloc(sizeof(size_t)* data_num);
fread(pValueStart, sizeof(size_t), data_num, fp);
key_table = (HashKeyTable*)malloc(sizeof(HashKeyTable)*HashKeyNum);
fread(key_table, sizeof(size_t)* 2, HashKeyNum, fp);
for (int i = 0; i<HashKeyNum; i++)
key_table[i].start = (size_t)key_table[i].start + pValueStart;
}

// Allocates the vote table: one row per known song, each row holding
// (1 << OFFSET_BITS) short counters. Rows are not cleared here.
void THash::ReBuildInit(){
    const size_t row_bytes = sizeof(short) * (1 << OFFSET_BITS);
    vote_table = (short **)malloc(sizeof(short*) * song_num);
    size_t song = 0;
    while (song < song_num){
        vote_table[song] = (short *)malloc(row_bytes);
        assert(vote_table[song] != NULL);
        ++song;
    }
}

// Resets every counter of every song's vote row to zero.
void THash::VoteInit(){
    const size_t row_bytes = sizeof(short) * (1 << OFFSET_BITS);
    size_t song = 0;
    while (song != song_num){
        memset(vote_table[song], 0, row_bytes);
        ++song;
    }
}

void THash::Vote(size_t f1, size_t f2_f1, size_t t, size_t offset){
HashKeyTable *pKey = &key_table[HashTableOffset(f1, f2_f1, t)];
size_t length = pKey->length;
while (length){
length--;
size_t offset_value = (*(pKey->start + length) << ID_BITS) >> ID_BITS;
if (offset_value < offset)
continue; // 为失效投票,这种情况的投票结果为错的
vote_table[(*(pKey->start + length)) >> OFFSET_BITS][offset_value - offset]++;
}
return;
}

// Scans the whole vote table for the largest counter.
// Returns the index of the song owning it and stores the matching column in
// `offset`. Ties keep the first maximum found (lowest song, then lowest
// column). With no songs, 0 is returned and `offset` is left untouched.
size_t THash::VoteResult(size_t &offset){
    const size_t columns = (1 << OFFSET_BITS);
    size_t best_song = 0;
    short best_votes = -1;
    for (size_t song = 0; song != song_num; ++song){
        for (size_t column = 0; column != columns; ++column){
            if (vote_table[song][column] <= best_votes)
                continue;
            best_votes = vote_table[song][column];
            best_song = song;
            offset = column;
        }
    }
    return best_song;
}

// Puts the object into an empty state (all table pointers null, counters 0)
// and allocates the song-name storage: MAX_SONG_NUM buffers of MAX_SONG_LEN
// bytes each.
THash::THash(){
    pValueStart = nullptr;
    pValueEnd = nullptr;
    vote_table = nullptr;
    data_num = 0;
    song_num = 0;
    key_info = nullptr;
    key_table = nullptr;   // was left uninitialised before
    song_list = (char **)malloc(MAX_SONG_NUM*sizeof(char*));
    for (int i=0; i<MAX_SONG_NUM; i++)
        song_list[i] = (char *)malloc(MAX_SONG_LEN*sizeof(char));
}

// Releases the song-name storage: every name buffer first, then the array of
// buffer pointers. Nothing else is released here.
THash::~THash(){
    int slot = 0;
    while (slot < MAX_SONG_NUM){
        free(song_list[slot]);
        ++slot;
    }
    free(song_list);
}
68 changes: 68 additions & 0 deletions Hash.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#ifndef _HashFunc_h_
#define _HashFunc_h_

#include "stdlib.h"
#include "memory.h"
#include "stdio.h"
#include "assert.h"
#include "string.h"
#include <iostream>

#define ID_BITS 18                  // Bits for the song id: about 260k songs (given enough server memory).
#define OFFSET_BITS 14              // Bits for the time offset: assumes every song is shorter than about 8.7 minutes.
#define MAX_SONG_NUM (2<<18)        // Number of song-name slots. NOTE(review): this is 2^19, twice the id range (1 << ID_BITS) -- confirm.
#define MAX_SONG_LEN 256            // Bytes per song-name buffer, terminator included.
#define HashKeyNum (1<<20)          // Number of hash keys.
#define ValuePerBlock (1<<6)        // Size of one value block (64 * sizeof(size_t)); a bucket stores its values in such blocks and can grow.
#define BlockNum 4                  // Block count used to size the pool for dynamic growth.
#define OverFlowThreshold (1<<28)   // Pool usage limit. Parenthesised so the macro expands safely inside larger expressions.
using namespace std;

// Per-key record used on the build side (iFlyBuild).
// Keys are addressed by the (f1, f2_f1, t) triple.
// The same struct is used for the chained block nodes that hold a key's values.
struct HashKeyInfo{
size_t* start; // first value of this block
size_t length; // number of values counted by this record
HashKeyInfo* next; // link to the next block node; used for growing a bucket
};
// Per-key record used on the select side (iFlySelect).
// NOTE(review): File2Hash reads each record from disk as two size_t values,
// so the member order and sizes here appear to be part of the file format.
struct HashKeyTable{
size_t *start; // first value belonging to this key
size_t length; // number of values belonging to this key
};

// Hash table with a build side (iFlyBuild: insert values, write to file)
// and a select side (iFlySelect: load from file, vote, pick the best song).
// Declarations no longer carry the "THash::" prefix: extra qualification on
// in-class member declarations is not valid standard C++.
// NOTE(review): the class owns raw allocations but has no copy control;
// copying an instance would presumably free the same buffers twice.
class THash{
public:
    size_t *pValueStart;      // base of the value array
    size_t *pValueEnd;        // build side: position of the next free block
    short **vote_table;       // one row of vote counters per song
    size_t data_num;          // number of stored values
    char **song_list;         // song names, one buffer per song
    size_t song_num;          // number of songs in song_list
    HashKeyInfo *key_info;    // build-side key table
    HashKeyTable *key_table;  // select-side key table

    THash();
    ~THash();
    void ReBuildInit();
    /************************************ Functions for build tracks.(iFlyBuild) ************************************************/
    // Build-side initialisation and teardown.
    void BuildInit();
    void BuildUnInit();
    // Adds a song name and updates the song count.
    void AddSongList(const char *filename);
    // Adds one value to the value blocks and updates the key table.
    void InsertHash(size_t f1, size_t f2_f1, size_t t, size_t id, size_t offset);
    // Writes the hash table to a file (only the used values, not the whole
    // pool, so that the select side does not waste memory).
    void Hash2File(const char* filename);

    /************************************ Functions for select tracks.(iFlySelect) ************************************************/
    size_t* GetHash(size_t f1, size_t f2_f1, size_t t);
    void File2Hash(const char* filename);
    // Functions for voting and reading back the top-voted song id.
    void VoteInit();
    void Vote(size_t f1, size_t f2_f1, size_t t, size_t offset);
    size_t VoteResult(size_t &offset);
};
#endif // _HashFunc_h_
41 changes: 41 additions & 0 deletions ReadMe.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
========================================================================
动态链接库:iFlyBuild 项目概述
========================================================================

应用程序向导已为您创建了此 iFlyBuild DLL。

本文件概要介绍组成 iFlyBuild 应用程序的每个文件的内容。


iFlyBuild.vcxproj
这是使用应用程序向导生成的 VC++ 项目的主项目文件,其中包含生成该文件的 Visual C++ 的版本信息,以及有关使用应用程序向导选择的平台、配置和项目功能的信息。

iFlyBuild.vcxproj.filters
这是使用“应用程序向导”生成的 VC++ 项目筛选器文件。它包含有关项目文件与筛选器之间的关联信息。在 IDE 中,通过这种关联,在特定节点下以分组形式显示具有相似扩展名的文件。例如,“.cpp”文件与“源文件”筛选器关联。

iFlyBuild.cpp
这是主 DLL 源文件。

此 DLL 在创建时不导出任何符号。因此,生成时不会产生 .lib 文件。如果希望此项目成为其他某个项目的项目依赖项,则需要添加代码以从 DLL 导出某些符号,以便产生一个导出库,或者,也可以在项目“属性页”对话框中的“链接器”文件夹中,将“常规”属性页上的“忽略输入库”属性设置为“是”。

/////////////////////////////////////////////////////////////////////////////
应用程序向导创建了下列资源:

iFlyBuild.rc
这是程序使用的所有 Microsoft Windows 资源的列表。它包括 RES 子目录中存储的图标、位图和光标。此文件可以直接在 Microsoft Visual C++ 中进行编辑。

Resource.h
这是标准头文件,可用于定义新的资源 ID。Microsoft Visual C++ 将读取并更新此文件。

/////////////////////////////////////////////////////////////////////////////
其他标准文件:

StdAfx.h, StdAfx.cpp
这些文件用于生成名为 iFlyBuild.pch 的预编译头 (PCH) 文件和名为 StdAfx.obj 的预编译类型文件。

/////////////////////////////////////////////////////////////////////////////
其他注释:

应用程序向导使用“TODO:”注释来指示应添加或自定义的源代码部分。

/////////////////////////////////////////////////////////////////////////////
17 changes: 17 additions & 0 deletions Resource.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//{{NO_DEPENDENCIES}}
// Microsoft Visual C++ generated include file.
// Used by iFlyBuild.rc
//

#define IDS_APP_TITLE 103

// Next default values for new objects
//
#ifdef APSTUDIO_INVOKED
#ifndef APSTUDIO_READONLY_SYMBOLS
#define _APS_NEXT_RESOURCE_VALUE 101
#define _APS_NEXT_COMMAND_VALUE 40001
#define _APS_NEXT_CONTROL_VALUE 1000
#define _APS_NEXT_SYMED_VALUE 101
#endif
#endif
Loading

0 comments on commit af6d544

Please sign in to comment.