diff --git a/src/utils/README.md b/src/utils/README.md new file mode 100644 index 0000000..fad4b5c --- /dev/null +++ b/src/utils/README.md @@ -0,0 +1,3 @@ +# `src/utils` + +The scripts in this folder are utility scripts that are not required for mining or analysis. \ No newline at end of file diff --git a/src/utils/create_representative_set_github.py b/src/utils/create_representative_set_github.py index 7146859..ae9695d 100644 --- a/src/utils/create_representative_set_github.py +++ b/src/utils/create_representative_set_github.py @@ -1,10 +1,10 @@ """ Samples 100 repositories from Github based on different amounts of stars. -Some metadata about these repositories is extracted and stored in a dataframe which is written to ../data/representative_set.csv. +Some metadata about these repositories is extracted and stored in a dataframe. """ from github import Github, GithubException -import json +import argparse import pandas as pd import configparser from tqdm import tqdm @@ -16,7 +16,7 @@ def get_access_token(): str: Access Token """ config = configparser.ConfigParser() - config.read('../config.cfg') + config.read('../../config.cfg') return config['ACCESS']['token'] def parse_samples(slice): @@ -49,6 +49,15 @@ def compose_repo_link(row) -> str: return link if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="create_representative_set_github", + description="Sample 100 repositories from Github based on different amounts of stars." + ) + parser.add_argument("-o", "--output", type=str, + help="output path for representative set", + default="../../data/debug/representative_set.csv") + args = parser.parse_args() + g = Github(get_access_token()) samples = {} stars_intervals = ["<1", "1..100", "100..1000", "1000..10000", ">10000"] @@ -57,4 +66,4 @@ def compose_repo_link(row) -> str: samples[interval] = parse_samples(result[:20]) df = pd.concat(samples.values()) df["github_id"] = df.apply(compose_repo_link, axis=1) - df.to_csv("../data/representative_set.csv", index=False) + df.to_csv(args.output, index=False) diff --git a/src/utils/rep_set_test.csv b/src/utils/rep_set_test.csv new file mode 100644 index 0000000..b30e946 --- /dev/null +++ b/src/utils/rep_set_test.csv @@ -0,0 +1,101 @@ +user_name,repo_name,stars,watchers,forks,commits_no,contributors_no,size_kb,github_id +jairourrego123,AppCalorias,0,1,0,20,1,621,jairourrego123/AppCalorias +hetpatel-web,SystemDevMetho,0,1,0,0,0,0,hetpatel-web/SystemDevMetho +morlzz111,mz.github.io,0,1,0,2,1,290,morlzz111/mz.github.io +manjeshboyapati,manjesh2,0,1,0,1,1,30,manjeshboyapati/manjesh2 +ds4owd-001,md-02-assignments-AlejandraSuarezCobos,0,1,0,3,1,1412,ds4owd-001/md-02-assignments-AlejandraSuarezCobos +Arancium98,ucihydraulic,0,1,0,13,1,175197,Arancium98/ucihydraulic +Peramitul,Peramitul,0,1,0,0,0,0,Peramitul/Peramitul +acciojob,auto-complete-letsmailvjkumar,0,2,0,2,2,394,acciojob/auto-complete-letsmailvjkumar +JuanCohetes,traductor,0,1,0,2,1,58,JuanCohetes/traductor +DMG-01,ETH_TRANSFER_APP,0,1,0,8,1,173,DMG-01/ETH_TRANSFER_APP +distinctm1nd,rosy_cross_badge,0,1,0,0,0,0,distinctm1nd/rosy_cross_badge +aurelienheude,P5-Print-it-JS,0,1,0,5,1,3651,aurelienheude/P5-Print-it-JS +Andynico,Arabica_custom,0,1,0,35741,1,1066436,Andynico/Arabica_custom +MarcinhoLetsCode,Jan-Ken-Po_MultiPlayer_V1,0,1,0,1,1,0,MarcinhoLetsCode/Jan-Ken-Po_MultiPlayer_V1 +Doston3300,barlow,0,1,0,1,1,2890,Doston3300/barlow +ava-smith,ava-smith,0,1,0,9,1,9,ava-smith/ava-smith +jesusdpdev,my-app-contador,0,1,0,2,1,194,jesusdpdev/my-app-contador +andrii-cherevko,andrii-cherevko.github.io,0,1,0,11,1,2962,andrii-cherevko/andrii-cherevko.github.io +Greavis,telebot_moderator,0,1,0,7,1,12,Greavis/telebot_moderator +imSHUBHANKAR,php-ProjectCA3,0,1,0,0,0,0,imSHUBHANKAR/php-ProjectCA3 +RedisGraph,redisgraph.js,100,8,26,207,10,11654,RedisGraph/redisgraph.js +leoafarias,neardb,100,6,9,219,5,2272,leoafarias/neardb +vueComponent,vue-ref,100,5,14,13,2,19,vueComponent/vue-ref +now1then,vue-h5-pro,100,6,50,12,1,2476,now1then/vue-h5-pro +ivaylokenov,CSharp-ORM-Battle,99,4,1,7,1,2213,ivaylokenov/CSharp-ORM-Battle +HazyResearch,fonduer-tutorials,100,18,26,100,5,8514,HazyResearch/fonduer-tutorials +recoilme,slowpoke,100,8,9,141,4,2334,recoilme/slowpoke +Peihao2021,O-NKU,100,13,8,3,1,68320,Peihao2021/O-NKU +meizhiju,layered-bilstm-crf,100,5,26,3,1,56,meizhiju/layered-bilstm-crf +corellium,preloader-m1,100,13,9,14,2,60,corellium/preloader-m1 +Kimundi,greenwasm,100,7,2,243,1,568,Kimundi/greenwasm +shouldnotappearcalm,yapi-plugin-interface-oauth2-token,100,5,22,55,4,83867,shouldnotappearcalm/yapi-plugin-interface-oauth2-token +elizarov,CoroutinesWorkshop,100,13,15,35,1,51979,elizarov/CoroutinesWorkshop +pathwar,pathwar,100,8,26,2864,28,37461,pathwar/pathwar +devpew,muffinReactNative,100,4,3,17,1,201,devpew/muffinReactNative +Nielk1,VSCView,100,10,5,299,4,18890,Nielk1/VSCView +0xC45,homelab-setup,100,3,8,65,1,106,0xC45/homelab-setup +erocoar,ggpol,100,4,10,168,3,1423,erocoar/ggpol +fancompute,qpga,100,10,16,42,2,15637,fancompute/qpga +imohamad,twitter-downloader-telegram-bot,100,5,31,5,0,5,imohamad/twitter-downloader-telegram-bot +ldqk,Masuit.MyBlogs,1000,36,285,1182,3,508818,ldqk/Masuit.MyBlogs +vuejs,test-utils,1000,26,236,2060,144,6050,vuejs/test-utils +dagger8224,dagger.js,1000,16,44,181,3,757,dagger8224/dagger.js +zhihu,rucene,1000,31,60,273,4,1915,zhihu/rucene +ratwithacompiler,OBS-captions-plugin,1000,26,69,189,2,2582,ratwithacompiler/OBS-captions-plugin +201853910,VMwareWorkstation,999,21,196,38,1,56,201853910/VMwareWorkstation +nccgroup,singularity,1000,32,142,210,5,3313,nccgroup/singularity +Kethsar,ytarchive,1000,26,87,302,16,213,Kethsar/ytarchive +Spu7Nix,SPWN-language,1000,18,61,1130,28,4166,Spu7Nix/SPWN-language +taigaio,taiga-docker,1000,16,270,127,13,563,taigaio/taiga-docker +labulakalia,crocodile,1000,20,163,161,6,14288,labulakalia/crocodile +colbyfayock,50-projects-for-react-and-the-static-web,1000,21,141,201,12,168,colbyfayock/50-projects-for-react-and-the-static-web +edvardHua,PoseEstimationForMobile,1000,53,261,86,5,167185,edvardHua/PoseEstimationForMobile +iximiuz,client-go-examples,1000,22,126,57,5,160,iximiuz/client-go-examples +open-source-labs,SvelteStorm,1000,17,114,690,27,35955,open-source-labs/SvelteStorm +kakaobrain,kogpt,1000,17,131,43,7,81,kakaobrain/kogpt +Rikj000,MoniGoMani,1000,64,162,1193,24,271688,Rikj000/MoniGoMani +Tsojan,TsojanScan,1000,14,56,13,1,51,Tsojan/TsojanScan +The-XSS-Rat,SecurityTesting,1000,47,258,165,1,15022,The-XSS-Rat/SecurityTesting +AGI-Edgerunners,LLM-Adapters,1000,12,90,180,5,76750,AGI-Edgerunners/LLM-Adapters +openspug,spug,9995,194,2025,1223,9,5538,openspug/spug +Baiyuetribe,paper2gui,9988,116,832,84,3,138737,Baiyuetribe/paper2gui +nvim-treesitter,nvim-treesitter,9985,48,845,5289,370,7114,nvim-treesitter/nvim-treesitter +SimplifyJobs,New-Grad-Positions,9968,1305,953,2078,263,2259,SimplifyJobs/New-Grad-Positions +pingcap,talent-plan,9957,251,1279,568,96,4110,pingcap/talent-plan +twitter,the-algorithm-ml,9954,101,2242,2,0,109,twitter/the-algorithm-ml +neovim,nvim-lspconfig,9946,84,2017,2970,445,4070,neovim/nvim-lspconfig +alexandresanlim,Badges4-README.md-Profile,9945,47,1532,1602,226,1465,alexandresanlim/Badges4-README.md-Profile +m-bain,whisperX,9940,123,1002,368,66,24060,m-bain/whisperX +Dujltqzv,Some-Many-Books,9939,122,1371,4,1,10,Dujltqzv/Some-Many-Books +aristocratos,bpytop,9926,155,406,408,32,1343,aristocratos/bpytop +sfyc23,EverydayWechat,9922,206,2265,214,13,359,sfyc23/EverydayWechat +lyhue1991,eat_tensorflow2_in_30_days,9912,269,2427,272,3,61324,lyhue1991/eat_tensorflow2_in_30_days +xenova,transformers.js,9910,71,592,1081,28,104477,xenova/transformers.js +chaitin,xray,9905,206,1781,821,108,35749,chaitin/xray +veeral-patel,how-to-secure-anything,9899,226,671,353,5,45025,veeral-patel/how-to-secure-anything +microsoft,wslg,9897,117,304,241,37,2031,microsoft/wslg +microsoft,STL,9894,249,1468,2079,200,29399,microsoft/STL +soxoj,maigret,9892,93,780,913,32,5860,soxoj/maigret +kubescape,kubescape,9891,96,816,3022,126,111568,kubescape/kubescape +codecrafters-io,build-your-own-x,40000,5227,26391,563,117,1065,codecrafters-io/build-your-own-x +996icu,996.ICU,40000,4224,21522,3205,398,187804,996icu/996.ICU +trekhleb,javascript-algorithms,40000,4359,29390,1104,195,13248,trekhleb/javascript-algorithms +CyC2018,CS-Notes,40000,5321,49300,3781,215,116179,CyC2018/CS-Notes +Significant-Gravitas,AutoGPT,40000,1562,43391,5357,440,129809,Significant-Gravitas/AutoGPT +jackfrued,Python-100-Days,40000,6137,51039,380,12,332089,jackfrued/Python-100-Days +Snailclimb,JavaGuide,40000,4523,44927,5419,416,175380,Snailclimb/JavaGuide +trimstray,the-book-of-secret-knowledge,40000,2410,8922,1068,98,1811,trimstray/the-book-of-secret-knowledge +AUTOMATIC1111,stable-diffusion-webui,40000,1049,25333,7384,430,35862,AUTOMATIC1111/stable-diffusion-webui +huggingface,transformers,40000,1099,24800,16262,433,234333,huggingface/transformers +labuladong,fucking-algorithm,40000,2311,22473,496,84,125717,labuladong/fucking-algorithm +microsoft,PowerToys,40000,1142,6613,7359,418,365855,microsoft/PowerToys +f,awesome-chatgpt-prompts,40000,1385,14631,434,80,759,f/awesome-chatgpt-prompts +GrowingGit,GitHub-Chinese-Top-Charts,40000,2576,12421,920,1,99430,GrowingGit/GitHub-Chinese-Top-Charts +denoland,deno,40000,1414,5340,11670,434,129247,denoland/deno +langchain-ai,langchain,40000,670,13956,10190,477,246053,langchain-ai/langchain +massgravel,Microsoft-Activation-Scripts,40000,895,8443,142,5,7997,massgravel/Microsoft-Activation-Scripts +microsoft,Web-Dev-For-Beginners,40000,2701,12222,1689,211,86694,microsoft/Web-Dev-For-Beginners +iptv-org,iptv,40000,1869,2198,30079,278,628268,iptv-org/iptv +tauri-apps,tauri,40000,498,2507,4677,356,83797,tauri-apps/tauri