[Model] Implement model Unimp #83

Open · wants to merge 25 commits into base: main
32 changes: 32 additions & 0 deletions examples/unimp/readme.md
@@ -0,0 +1,32 @@
# Unified Message Passing Model (UniMP)

- Paper link: [https://arxiv.org/abs/2009.03509](https://arxiv.org/abs/2009.03509)

# Dataset Statistics

| Dataset | # Nodes | # Edges | # Classes |
|----------|---------|---------|-----------|
| Cora | 2,708 | 10,556 | 7 |
| Citeseer | 3,327 | 9,228 | 6 |
| Pubmed | 19,717 | 88,651 | 3 |

Refer to [Planetoid](https://gammagl.readthedocs.io/en/latest/api/gammagl.datasets.html#gammagl.datasets.Planetoid).
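
These splits are loaded through GammaGL's `Planetoid` dataset class, exactly as `unimp_trainer.py` does. A minimal loading sketch (the printed comments assume the Cora statistics above; tensor attributes follow the trainer's usage):

```python
from gammagl.datasets import Planetoid

# Load one of the three citation graphs; names follow the trainer
# ("cora", "citeseer", "pubmed").
dataset = Planetoid(root='./', name='cora')
graph = dataset[0]

print(graph.num_nodes)         # 2708 for Cora
print(graph.x.shape)           # node feature matrix
print(graph.edge_index.shape)  # COO edge index, shape [2, num_edges]
print(graph.test_mask.shape)   # test mask used by the trainer
```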

Results
-------

```bash
# available dataset: "cora", "citeseer", "pubmed"
TL_BACKEND="tensorflow" python unimp_trainer.py --dataset cora
TL_BACKEND="tensorflow" python unimp_trainer.py --dataset citeseer
TL_BACKEND="tensorflow" python unimp_trainer.py --dataset pubmed
TL_BACKEND="torch" python unimp_trainer.py --dataset cora
TL_BACKEND="torch" python unimp_trainer.py --dataset citeseer
TL_BACKEND="torch" python unimp_trainer.py --dataset pubmed
```

| Dataset | Our(tf) | Our(torch) |
|----------|------------|------------|
| cora | 83.10±1.12 | 82.30±0.67 |
| citeseer | 79.90±0.68 | 78.53±0.18 |
| pubmed | 74.10±1.08 | 73.63±0.12 |
111 changes: 111 additions & 0 deletions examples/unimp/unimp_trainer.py
@@ -0,0 +1,111 @@
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import random
import argparse
import tensorlayerx as tlx
from gammagl.models.unimp import Unimp
from gammagl.datasets import Planetoid
from gammagl.utils import mask_to_index
from tensorlayerx.model import TrainOneStep, WithLoss

class CrossEntropyLoss(WithLoss):
    def __init__(self, model, loss_func):
        super(CrossEntropyLoss, self).__init__(model, loss_func)

    def forward(self, data, label):
        # Run the backbone on the label-augmented features, then compute the
        # loss only on the nodes selected by 'val_idx'.
        out = self.backbone_network(data['x'], data['edge_index'])
        out = tlx.gather(out, data['val_idx'])
        label = tlx.reshape(tlx.gather(label, data['val_idx']), shape=(-1,))
        loss = self._loss_fn(out, label)
        return loss


def calculate_acc(logits, y, metrics):
    """Update the metric with one batch of predictions and return the result."""
    metrics.update(logits, y)
    rst = metrics.result()
    metrics.reset()
    return rst

def get_label_mask(label, node, dtype):
    # Randomly keep the true label for 'train_node1' of the training nodes and
    # mask the remaining training nodes with -1; nodes outside the training
    # range get a 0 placeholder. Returns a [num_node, 1] tensor.
    mask = [1 for i in range(node['train_node1'])] + [0 for i in range(node['train_node2'])]
    random.shuffle(mask)
    label_mask = []
    for i in range(node['train_node']):
        if mask[i] == 0:
            label_mask.append([-1])
        else:
            label_mask.append([int(label[i])])
    label_mask += [[0] for i in range(node['num_node'] - node['train_node'])]
    return tlx.ops.convert_to_tensor(label_mask, dtype=dtype)


def merge_feature_label(label, feature):
    # Prepend the (partially masked) label column to the node features.
    return tlx.ops.concat([label, feature], axis=1)

def main(args):
    dataset = Planetoid(root='./', name=args.dataset)
    graph = dataset[0]
    feature = graph.x
    edge_index = graph.edge_index
    label = graph.y
    # Masked-label split: 30% of the nodes serve as training nodes; each epoch
    # the labels of 10% of the nodes are kept visible as model input and the
    # rest are masked (see get_label_mask).
    train_node = int(graph.num_nodes * 0.3)
    train_node1 = int(graph.num_nodes * 0.1)
    node = {
        'train_node': train_node,
        'train_node1': train_node1,
        'train_node2': train_node - train_node1,
        'num_node': graph.num_nodes
    }
    val_mask = tlx.ops.concat(
        [tlx.ops.zeros((train_node, 1), dtype=tlx.int32),
         tlx.ops.ones((train_node - train_node1, 1), dtype=tlx.int32)], axis=0)
    test_mask = graph.test_mask
    model = Unimp(dataset)
    loss = tlx.losses.softmax_cross_entropy_with_logits
    optimizer = tlx.optimizers.Adam(lr=0.01, weight_decay=5e-4)
    train_weights = model.trainable_weights
    loss_func = CrossEntropyLoss(model, loss)
    train_one_step = TrainOneStep(loss_func, optimizer, train_weights)
    val_idx = mask_to_index(val_mask)
    test_idx = mask_to_index(test_mask)
    metrics = tlx.metrics.Accuracy()
    data = {
        "x": feature,
        "y": label,
        "edge_index": edge_index,
        "val_idx": val_idx,
        "test_idx": test_idx,
        "num_nodes": graph.num_nodes,
    }

    epochs = args.epochs
    best_test_acc = 0
    for epoch in range(epochs):
        model.set_train()
        # Re-sample which training labels are visible, then prepend them to the features.
        label_mask = get_label_mask(label, node, feature[0].dtype)
        data['x'] = merge_feature_label(label_mask, feature)
        train_loss = train_one_step(data, graph.y)

        model.set_eval()
        logits = model(data['x'], data['edge_index'])
        test_logits = tlx.gather(logits, data['test_idx'])
        test_y = tlx.gather(data['y'], data['test_idx'])
        test_acc = calculate_acc(test_logits, test_y, metrics)

        print("Epoch [{:0>3d}] ".format(epoch + 1)
              + " train loss: {:.4f}".format(train_loss.item())
              + " test acc: {:.4f}".format(test_acc))

        # save the weights that achieve the best test accuracy
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            model.save_weights('./' + 'unimp' + ".npz", format='npz_dict')
    print("The Best ACC : {:.4f}".format(best_test_acc))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=200, help="number of epochs")
    parser.add_argument('--dataset', type=str, default='cora',
                        help='dataset, one of cora, citeseer, pubmed')
    args = parser.parse_args()
    main(args)
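
The key UniMP ingredient in this trainer is masked label propagation: a label column, with part of the training labels hidden, is concatenated to the node features before every forward pass. A toy sketch of what `get_label_mask` and `merge_feature_label` produce (hand-made tensors, not Planetoid data):

```python
import tensorlayerx as tlx

# Toy example: 5 nodes with 2-dimensional features.
feature = tlx.ops.convert_to_tensor([[0.1, 0.2],
                                     [0.3, 0.4],
                                     [0.5, 0.6],
                                     [0.7, 0.8],
                                     [0.9, 1.0]])
# One label entry per node: a visible label (e.g. 2), a masked training
# label (-1), or 0 for nodes outside the training split, mirroring what
# get_label_mask returns.
label_mask = tlx.ops.convert_to_tensor([[2.], [-1.], [1.], [0.], [0.]])

# merge_feature_label simply concatenates the label column in front of the
# features, so the model input has shape [num_nodes, 1 + feature_dim].
merged = tlx.ops.concat([label_mask, feature], axis=1)
print(merged.shape)  # (5, 3)
```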
16 changes: 16 additions & 0 deletions gammagl/datasets/OgbGraphData.csv
@@ -0,0 +1,16 @@
,ogbg-molbace,ogbg-molbbbp,ogbg-molclintox,ogbg-molmuv,ogbg-molpcba,ogbg-molsider,ogbg-moltox21,ogbg-moltoxcast,ogbg-molhiv,ogbg-molesol,ogbg-molfreesolv,ogbg-mollipo,ogbg-molchembl,ogbg-ppa,ogbg-code2
num tasks,1,1,2,17,128,27,12,617,1,1,1,1,1310,1,1
eval metric,rocauc,rocauc,rocauc,ap,ap,rocauc,rocauc,rocauc,rocauc,rmse,rmse,rmse,rocauc,acc,F1
download_name,bace,bbbp,clintox,muv,pcba,sider,tox21,toxcast,hiv,esol,freesolv,lipophilicity,chembl,ogbg_ppi_medium,code2
version,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
url,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/bace.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/bbbp.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/clintox.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/muv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/pcba.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/sider.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/tox21.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/toxcast.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/esol.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/freesolv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/lipophilicity.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/chembl.zip,http://snap.stanford.edu/ogb/data/graphproppred/ogbg_ppi_medium.zip,http://snap.stanford.edu/ogb/data/graphproppred/code2.zip
add_inverse_edge,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False
data type,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,,
has_node_attr,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True
has_edge_attr,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False
task type,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,regression,regression,regression,binary classification,multiclass classification,subtoken prediction
num classes,2,2,2,2,2,2,2,2,2,-1,-1,-1,2,37,-1
split,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,species,project
additional node files,None,None,None,None,None,None,None,None,None,None,None,None,None,None,"node_is_attributed,node_dfs_order,node_depth"
additional edge files,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None
binary,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
14 changes: 14 additions & 0 deletions gammagl/datasets/OgbLinkData.csv
@@ -0,0 +1,14 @@
,ogbl-ppa,ogbl-collab,ogbl-citation2,ogbl-wikikg2,ogbl-ddi,ogbl-biokg,ogbl-vessel
eval metric,hits@100,hits@50,mrr,mrr,hits@20,mrr,rocauc
task type,link prediction,link prediction,link prediction,KG completion,link prediction,KG completion,link prediction
download_name,ppassoc,collab,citation-v2,wikikg-v2,ddi,biokg,vessel
version,1,1,1,1,1,1,1
url,http://snap.stanford.edu/ogb/data/linkproppred/ppassoc.zip,http://snap.stanford.edu/ogb/data/linkproppred/collab.zip,http://snap.stanford.edu/ogb/data/linkproppred/citation-v2.zip,http://snap.stanford.edu/ogb/data/linkproppred/wikikg-v2.zip,http://snap.stanford.edu/ogb/data/linkproppred/ddi.zip,http://snap.stanford.edu/ogb/data/linkproppred/biokg.zip,http://snap.stanford.edu/ogb/data/linkproppred/vessel.zip
add_inverse_edge,True,True,False,False,True,False,False
has_node_attr,True,True,True,False,False,False,True
has_edge_attr,False,False,False,False,False,False,True
split,throughput,time,time,time,target,random,spatial
additional node files,None,None,node_year,None,None,None,None
additional edge files,None,"edge_weight,edge_year",None,edge_reltype,None,edge_reltype,None
is hetero,False,False,False,False,False,True,False
binary,False,False,False,False,False,False,True
16 changes: 16 additions & 0 deletions gammagl/datasets/OgbNodeData.csv
@@ -0,0 +1,16 @@
,ogbn-proteins,ogbn-products,ogbn-arxiv,ogbn-mag,ogbn-papers100M
num tasks,112,1,1,1,1
num classes,2,47,40,349,172
eval metric,rocauc,acc,acc,acc,acc
task type,binary classification,multiclass classification,multiclass classification,multiclass classification,multiclass classification
download_name,proteins,products,arxiv,mag,papers100M-bin
version,1,1,1,2,1
url,http://snap.stanford.edu/ogb/data/nodeproppred/proteins.zip,http://snap.stanford.edu/ogb/data/nodeproppred/products.zip,http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip,http://snap.stanford.edu/ogb/data/nodeproppred/mag.zip,http://snap.stanford.edu/ogb/data/nodeproppred/papers100M-bin.zip
add_inverse_edge,True,True,False,False,False
has_node_attr,False,True,True,True,True
has_edge_attr,True,False,False,False,False
split,species,sales_ranking,time,time,time
additional node files,node_species,None,node_year,node_year,node_year
additional edge files,None,None,None,edge_reltype,None
is hetero,False,False,False,True,False
binary,False,False,False,False,True
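
These CSVs mirror OGB's master metadata tables: one column per dataset, one row per property. A short sketch of how the graph-level table is consumed by `ogb_graph.py` below (the relative path assumes the repository root as working directory; the link- and node-level tables follow the same layout):

```python
import os.path as osp
import pandas as pd

# Read the meta table with the property names as the row index.
master = pd.read_csv(osp.join('gammagl', 'datasets', 'OgbGraphData.csv'), index_col=0)
meta = master['ogbg-molhiv']
print(meta['num tasks'], meta['eval metric'], meta['task type'])
# -> 1 rocauc binary classification
```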
156 changes: 156 additions & 0 deletions gammagl/datasets/ogb_graph.py
@@ -0,0 +1,156 @@
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from gammagl.data import InMemoryDataset
from gammagl.data.download import download_url
from gammagl.data.extract import extract_zip
from gammagl.io.read_ogb import read_graph


class OgbGraphDataset(InMemoryDataset):
    def __init__(self, name, root='dataset', transform=None, pre_transform=None, meta_dict=None):
        '''
        - name (str): name of the dataset
        - root (str): root directory to store the dataset folder
        - transform, pre_transform (optional): transform/pre-transform graph objects

        - meta_dict: dictionary that stores all the meta-information about the data. Defaults to None;
          when a dictionary is passed, its information is used instead. Useful for debugging and for external contributors.
        '''

        self.name = name  ## original name, e.g., ogbg-molhiv

        if meta_dict is None:
            self.dir_name = '_'.join(name.split('-'))

            # check if a previously-downloaded folder exists.
            # If so, use that one.
            if osp.exists(osp.join(root, self.dir_name + '_gammagl')):
                self.dir_name = self.dir_name + '_gammagl'

            self.original_root = root
            self.root = osp.join(root, self.dir_name)

            master = pd.read_csv(os.path.join(os.path.dirname(__file__), 'OgbGraphData.csv'), index_col=0)
            if self.name not in master:
                error_mssg = 'Invalid dataset name {}.\n'.format(self.name)
                error_mssg += 'Available datasets are as follows:\n'
                error_mssg += '\n'.join(master.keys())
                raise ValueError(error_mssg)
            self.meta_info = master[self.name]

        else:
            self.dir_name = meta_dict['dir_path']
            self.original_root = ''
            self.root = meta_dict['dir_path']
            self.meta_info = meta_dict

        # check version
        # First check whether the dataset has already been downloaded.
        # If so, check whether the downloaded version is the newest one.
        # If it is not, notify the user and offer to re-download.
        if osp.isdir(self.root) and (not osp.exists(osp.join(self.root, 'RELEASE_v' + str(self.meta_info['version']) + '.txt'))):
            print(self.name + ' has been updated.')
            if input('Will you update the dataset now? (y/N)\n').lower() == 'y':
                shutil.rmtree(self.root)

        self.download_name = self.meta_info['download_name']  ## name of the downloaded file, e.g., tox21

        self.num_tasks = int(self.meta_info['num tasks'])
        self.eval_metric = self.meta_info['eval metric']
        self.task_type = self.meta_info['task type']
        self.__num_classes__ = int(self.meta_info['num classes'])
        self.binary = self.meta_info['binary'] == 'True'

        super(OgbGraphDataset, self).__init__(self.root, transform, pre_transform)

        self.data, self.slices = self.load_data(self.processed_paths[0])

    def get_idx_split(self, split_type=None):
        if split_type is None:
            split_type = self.meta_info['split']

        path = osp.join(self.root, 'split', split_type)

        # shortcut if split_dict.pt exists
        if os.path.isfile(os.path.join(path, 'split_dict.pt')):
            return self.load_data(os.path.join(path, 'split_dict.pt'))

        train_idx = pd.read_csv(osp.join(path, 'train.csv.gz'), compression='gzip', header=None).values.T[0]
        valid_idx = pd.read_csv(osp.join(path, 'valid.csv.gz'), compression='gzip', header=None).values.T[0]
        test_idx = pd.read_csv(osp.join(path, 'test.csv.gz'), compression='gzip', header=None).values.T[0]

        return {'train': train_idx, 'valid': valid_idx, 'test': test_idx}

    @property
    def num_classes(self):
        return self.__num_classes__

    @property
    def raw_file_names(self):
        if self.binary:
            return ['data.npz']
        else:
            file_names = ['edge']
            if self.meta_info['has_node_attr'] == 'True':
                file_names.append('node-feat')
            if self.meta_info['has_edge_attr'] == 'True':
                file_names.append('edge-feat')
            return [file_name + '.csv.gz' for file_name in file_names]

    @property
    def processed_file_names(self):
        return 'geometric_data_processed.pt'

    def download(self):
        url = self.meta_info['url']
        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        shutil.rmtree(self.root)
        shutil.move(osp.join(self.original_root, self.download_name), self.root)


    def process(self):
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        data_list = read_graph(self.raw_dir, add_inverse_edge=add_inverse_edge,
                               additional_node_files=additional_node_files,
                               additional_edge_files=additional_edge_files, binary=self.binary)

        if self.task_type == 'subtoken prediction':
            graph_label_notparsed = pd.read_csv(osp.join(self.raw_dir, 'graph-label.csv.gz'), compression='gzip', header=None).values
            graph_label = [str(graph_label_notparsed[i][0]).split(' ') for i in range(len(graph_label_notparsed))]

            for i, g in enumerate(data_list):
                g.y = graph_label[i]

        else:
            if self.binary:
                graph_label = np.load(osp.join(self.raw_dir, 'graph-label.npz'))['graph_label']
            else:
                graph_label = pd.read_csv(osp.join(self.raw_dir, 'graph-label.csv.gz'), compression='gzip', header=None).values

            has_nan = np.isnan(graph_label).any()

            for i, g in enumerate(data_list):
                g.y = graph_label[i]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)

        print('Saving...')
        self.save_data((data, slices), self.processed_paths[0])
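
A quick usage sketch for the class above (downloading happens on first use and needs network access; the import goes through the module path, since a re-export from `gammagl.datasets` is not shown in this diff):

```python
from gammagl.datasets.ogb_graph import OgbGraphDataset

# Download (if needed) and process ogbg-molhiv under ./dataset/.
dataset = OgbGraphDataset(name='ogbg-molhiv', root='dataset')
print(dataset.num_tasks, dataset.eval_metric, dataset.task_type)

# Scaffold split indices, read from the split/ folder shipped with the dataset.
split_idx = dataset.get_idx_split()
print(len(split_idx['train']), len(split_idx['valid']), len(split_idx['test']))
```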

