-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze.py
67 lines (61 loc) · 3.09 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import argparse
import sys
import logging
import multiprocessing
from config import SQLALCHEMY_DATABASE_URI
from datetime import datetime
from model.raw_comment import RawComment
from model.pull_request import PullRequest
from analyzer.analyzer import Analyzer
from analyzer.git.git_producer import GitRecordsProducer
if __name__ == '__main__':
    # Entry point: loads raw comments and closed pull requests from the DB,
    # runs the analyzer over both, dumps records to CSV and logs timings.
    # Parse command line arguments.
    parser = argparse.ArgumentParser(description='Analyzes required RCs and PRs. By default all.')
    parser.add_argument('rcs', type=int, nargs='?', default=-1, help='Raw Comments count.')
    parser.add_argument('prs', type=int, nargs='?', default=-1, help='Pull Requests count.')
    parser.add_argument('--chunks', action='store_true',
            help='Flag to flush records by chunks. It allows to reduce RAM load but slows down analyzing'
            ' speed more than 2 times.')
    parser.add_argument("--train-ratio", type=float, default=0.8, help="Train-test separation ratio.")
    args = parser.parse_args()
    # Connect db.
    Session = sessionmaker(autoflush=False)
    engine = create_engine(SQLALCHEMY_DATABASE_URI)
    Session.configure(bind=engine)
    session = Session()
    # Create logger.
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    logger = logging.getLogger("analyzer")
    # Get required number of RC-s and PR-s.
    time1 = datetime.today()
    try:
        # Use all RCs (a negative limit, the default -1, means "no limit").
        raw_comments = session.query(RawComment).limit(args.rcs).all()
        # Use only closed PRs.
        prs = session.query(PullRequest).filter(PullRequest.state == "closed").limit(args.prs).all()
    finally:
        # .all() fully materializes both result lists, so the session is no
        # longer needed — close it to release the DB connection even on error.
        session.close()
    # Build analyzer.
    analyzer = Analyzer(logger, args.chunks, GitRecordsProducer())
    # TODO analyzer = Analyzer(logger, args.chunks, GitRecordsProducer(), XmlRecordsProducer(), SwiftRecordsProducer())
    # Start analyze.
    time2 = datetime.today()
    logger.info("Load %d raw comments and %d pull requests in %s.", len(raw_comments), len(prs),
                time2 - time1)
    # Hoist the (stable) CPU count instead of querying it per call.
    workers = multiprocessing.cpu_count()
    # Analyze and write to CSV files.
    rc_records_count = analyzer.analyze_items(raw_comments, workers)
    time3 = datetime.today()
    logger.info("Got %d records due %d raw comments analyzing in %s.", rc_records_count, len(raw_comments),
                time3 - time2)
    pr_records_count = analyzer.analyze_items(prs, workers)
    time4 = datetime.today()
    logger.info("Got %d records due %d pull requests analyzing in %s.", pr_records_count, len(prs),
                time4 - time3)
    analyzer.finalize(args.train_ratio)
    records_count = rc_records_count + pr_records_count
    time5 = datetime.today()
    logger.info("Dumped %d records in %s.", records_count, time5 - time4)
    logger.info("Total %s for analyzing %d raw comments and %d pull requests.", time5 - time1, len(raw_comments),
                len(prs))
    # Guard against ZeroDivisionError when nothing was analyzed (empty DB or
    # zero-limit arguments produce records_count == 0).
    if records_count > 0:
        logger.info("Percent of negative records (without RC ID) in all records is %f (%d vs %d).",
                    pr_records_count/records_count, rc_records_count, pr_records_count)
    else:
        logger.info("No records were produced - negative records ratio is undefined.")