diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index eb9fb9f..aad17ea 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,6 +22,14 @@ repos:
   - id: flake8
     exclude: ^causalnex/ebaybbn
 
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.2.5
+  hooks:
+  - id: codespell
+    args:
+      - --skip=docs/source/*
+      - --ignore-words-list=fro,jaques,fpr,te
+
 - repo: https://github.com/pre-commit/mirrors-isort
   rev: v4.3.21
   hooks:
diff --git a/causalnex/discretiser/discretiser_strategy.py b/causalnex/discretiser/discretiser_strategy.py
index 480cca1..41e4609 100644
--- a/causalnex/discretiser/discretiser_strategy.py
+++ b/causalnex/discretiser/discretiser_strategy.py
@@ -245,7 +245,7 @@ def __init__(
 
         Args:
             min_depth: The minimum depth of the interval splitting.
-            min_split: The minmum size to split a bin
+            min_split: The minimum size to split a bin
             dtype: The type of the array returned by the `transform()` method
             **dlp_args: keyword arguments, which are parameters used for `mdlp.discretization.MDLP`
         Raises:
diff --git a/causalnex/ebaybbn/bbn.py b/causalnex/ebaybbn/bbn.py
index f85ddea..8c9455a 100644
--- a/causalnex/ebaybbn/bbn.py
+++ b/causalnex/ebaybbn/bbn.py
@@ -76,7 +76,7 @@ def __init__(self, nodes_dict, name=None, domains={}):
             # variable it 'introduced'.
             # Note that we cannot record
             # this duing Node instantiation
-            # becuase at that point we do
+            # because at that point we do
             # not yet know *which* of the
             # variables in the argument
             # list is the one being modeled
@@ -220,7 +220,7 @@ def initialize_potentials(self, assignments, bbn, evidence={}):
 
         # Step 2: Note that in H&D the assignments are
         # done as part of step 2 however we have
-        # seperated the assignment algorithm out and
+        # separated the assignment algorithm out and
         # done these prior to step 1.
         # Now for each assignment we want to
         # generate a truth-table from the
@@ -302,7 +302,7 @@ def assign_clusters(self, bbn):
                 # once and once only. The example
                 # in H&D just happens to be a clique
                 # that f_a could have been assigned
-                # to but wasnt presumably because
+                # to but wasn't presumably because
                 # it got assigned somewhere else.
                 pass
                 # continue
@@ -313,7 +313,7 @@ def assign_clusters(self, bbn):
             family = set(args)
             # At this point we need to know which *variable*
             # a BBN node represents. Up to now we have
-            # not *explicitely* specified this, however
+            # not *explicitly* specified this, however
             # we have been following some conventions
             # so we could just use this convention for
             # now. Need to come back to this to
@@ -426,8 +426,8 @@ def marginal(self, bbn_node):
         for node in self.clique_nodes:
             if bbn_node.name in [n.name for n in node.clique.nodes]:
                 containing_nodes.append(node)
-                # In theory it doesnt matter which one we
-                # use so we could bale out after we
+                # In theory it doesn't matter which one we
+                # use so we could bail out after we
                 # find the first one
                 # TODO: With some better indexing we could
                 # avoid searching for this node every time...
@@ -540,7 +540,7 @@ def pass_message(self, target):
         logging.debug(
             " Send the summed marginals to the target: %s ", str(sepset_node)
         )
-        # Step 2 absorbtion
+        # Step 2 absorption
         self.absorb(sepset_node, target)
 
     def project(self, sepset_node):
@@ -572,7 +572,7 @@ def absorb(self, sepset, target):
         # Assign a new potential tt to
         # Y (the target)
         logging.debug(
-            "Absorb potentails from sepset node %s into clique %s",
+            "Absorb potentials from sepset node %s into clique %s",
             sepset.name,
             target.name,
         )
@@ -650,7 +650,7 @@ def insert(self, forest):
         cliques are in different trees,
         means that effectively we are
         collapsing the two trees into
-        one. We will explicitely perform
+        one. We will explicitly perform
         this collapse by adding the sepset
         node into the tree and adding
         edges between itself
diff --git a/causalnex/network/network.py b/causalnex/network/network.py
index 4ff3738..4467b67 100644
--- a/causalnex/network/network.py
+++ b/causalnex/network/network.py
@@ -402,7 +402,7 @@ def fit_cpds(
                   regardless of variable cardinality;
                 - "BDeu": equivalent of using Dirichlet and using uniform 'pseudo_counts' of
                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
-                  for each node. Use equivelant_sample_size.
+                  for each node. Use equivalent_sample_size.
             equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.
 
         Returns:
@@ -463,7 +463,7 @@ def fit_node_states_and_cpds(
                   regardless of variable cardinality;
                 - "BDeu": equivalent of using dirichlet and using uniform 'pseudo_counts' of
                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
-                  for each node. Use equivelant_sample_size.
+                  for each node. Use equivalent_sample_size.
             equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.
 
         Returns:
diff --git a/causalnex/structure/data_generators/wrappers.py b/causalnex/structure/data_generators/wrappers.py
index f8abac1..2077267 100644
--- a/causalnex/structure/data_generators/wrappers.py
+++ b/causalnex/structure/data_generators/wrappers.py
@@ -620,7 +620,7 @@ def gen_stationary_dyn_net_and_df(  # pylint: disable=R0913, R0914
         w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay
         sem_type: {linear-gauss,linear-exp,linear-gumbel}
         noise_scale: scale parameter of noise distribution in linear SEM
-        max_data_gen_trials: maximun number of attempts until obtaining a seemingly stationary model
+        max_data_gen_trials: maximum number of attempts until obtaining a seemingly stationary model
     Returns:
         Tuple with:
         - the model created,as a Structure model
diff --git a/causalnex/structure/pytorch/notears.py b/causalnex/structure/pytorch/notears.py
index 68e367c..a229822 100644
--- a/causalnex/structure/pytorch/notears.py
+++ b/causalnex/structure/pytorch/notears.py
@@ -344,7 +344,7 @@ def from_pandas(
         **kwargs,
     )
 
-    # set comprehension to ensure only unique dist types are extraced
+    # set comprehension to ensure only unique dist types are extracted
     # NOTE: this prevents double-renaming caused by the same dist type used on expanded columns
     unique_dist_types = {node[1]["dist_type"] for node in g.nodes(data=True)}
     # use the dist types to update the idx_col mapping
@@ -375,7 +375,7 @@ def from_pandas(
         node_name = idx_col_expanded[node[0]]
         sm.nodes[node_name]["bias"] = node[1]["bias"]
 
-    # recover and preseve the node dist_types
+    # recover and preserve the node dist_types
    for node_data in g.nodes(data=True):
         node_name = idx_col_expanded[node_data[0]]
         sm.nodes[node_name]["dist_type"] = node_data[1]["dist_type"]
diff --git a/causalnex/structure/pytorch/sklearn/_base.py b/causalnex/structure/pytorch/sklearn/_base.py
index 5eb0d51..f69125d 100644
--- a/causalnex/structure/pytorch/sklearn/_base.py
+++ b/causalnex/structure/pytorch/sklearn/_base.py
@@ -82,7 +82,7 @@ def __init__(
             alpha: l1 loss weighting. When using nonlinear layers this is only applied
                 to the first layer.
 
-            beta: l2 loss weighting. Applied across all layers. Reccomended to use this
+            beta: l2 loss weighting. Applied across all layers. Recommended to use this
                 when fitting nonlinearities.
 
             fit_intercept: Whether to fit an intercept in the structure model
@@ -111,7 +111,7 @@ def __init__(
 
             standardize: Whether to standardize the X and y variables before fitting.
                 The L-BFGS algorithm used to fit the underlying NOTEARS works best on data
-                all of the same scale so this parameter is reccomended.
+                all of the same scale so this parameter is recommended.
 
             notears_mlp_kwargs: Additional arguments for the NOTEARS MLP model.
 
@@ -160,7 +160,7 @@ def __init__(
         self.target_dist_type = target_dist_type
         self.notears_mlp_kwargs = notears_mlp_kwargs
 
-        # sklearn wrapper paramters
+        # sklearn wrapper parameters
         self.dependent_target = dependent_target
         self.enforce_dag = enforce_dag
         self.standardize = standardize
diff --git a/causalnex/utils/pgmpy_utils.py b/causalnex/utils/pgmpy_utils.py
index f83c791..39b987e 100644
--- a/causalnex/utils/pgmpy_utils.py
+++ b/causalnex/utils/pgmpy_utils.py
@@ -113,7 +113,7 @@ def cpd_multiplication(
 
     Args:
         cpds: cpds to multiply
-        normalize: wether to normalise the columns, so that each column sums to 1
+        normalize: whether to normalise the columns, so that each column sums to 1
 
     Returns:
         Pandas dataframe containing the resulting product, looking like a cpd
diff --git a/devel-gpu.Dockerfile b/devel-gpu.Dockerfile
index 32cc996..6a22f02 100644
--- a/devel-gpu.Dockerfile
+++ b/devel-gpu.Dockerfile
@@ -4,7 +4,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update -y && apt install -y python3.8 python3-pip
 RUN ln -s $(which python3) /usr/local/bin/python
 
-# Copy all files to container as intalling .[pytorch] requires setup.py, which requires other files
+# Copy all files to container as installing .[pytorch] requires setup.py, which requires other files
 COPY . /tmp
 WORKDIR /tmp
diff --git a/tests/ebaybbn/test_ebaybbn.py b/tests/ebaybbn/test_ebaybbn.py
index 17a7b14..cc605da 100644
--- a/tests/ebaybbn/test_ebaybbn.py
+++ b/tests/ebaybbn/test_ebaybbn.py
@@ -287,7 +287,7 @@ def priority_func_override(node):
 
     def test_initialize_potentials(self, huang_darwiche_jt, huang_darwiche_dag):
         # Seems like there can be multiple assignments so
-        # for this test we will set the assignments explicitely
+        # for this test we will set the assignments explicitly
         cliques = {node.name: node for node in huang_darwiche_jt.nodes}
         bbn_nodes = {node.name: node for node in huang_darwiche_dag.nodes}
         assignments = {
diff --git a/tests/estimator/test_em.py b/tests/estimator/test_em.py
index 585fbd8..f8cbf49 100644
--- a/tests/estimator/test_em.py
+++ b/tests/estimator/test_em.py
@@ -187,7 +187,7 @@ def get_correct_cpds(
 class TestEMJobs:
     @pytest.mark.parametrize("n_jobs", [1, 3, -2])
     def test_em_no_missing_data(self, n_jobs):
-        """If all data for the latent variable is provided, the result is the same as runing bn.fit_cpds"""
+        """If all data for the latent variable is provided, the result is the same as running bn.fit_cpds"""
         df, sm, node_states, true_lv_values = naive_bayes_plus_parents(
             percentage_not_missing=1
         )
diff --git a/tests/structure/data_generators/test_core.py b/tests/structure/data_generators/test_core.py
index 76d7eb1..efc851a 100644
--- a/tests/structure/data_generators/test_core.py
+++ b/tests/structure/data_generators/test_core.py
@@ -331,7 +331,7 @@ def test_mixed_type_independence(
             seed=seed,
         )
 
-        atol = 0.02  # at least 2% difference bewteen joint & factored!
+        atol = 0.02  # at least 2% difference between joint & factored!
         # 1. dependent links
         # 0 -> 1 (we look at the class with the highest deviation from uniform
         # to avoid small values)
diff --git a/tests/structure/data_generators/test_wrappers.py b/tests/structure/data_generators/test_wrappers.py
index de144f7..61281a2 100644
--- a/tests/structure/data_generators/test_wrappers.py
+++ b/tests/structure/data_generators/test_wrappers.py
@@ -427,7 +427,7 @@ def test_dataframe(self, graph, distribution, noise_std, intercept, seed, kernel
     @pytest.mark.parametrize("seed", (10, 20, 30))
     def test_independence(self, graph_gen, seed, num_nodes):
         """
-        test whether the relation is accurate, implicitely tests sequence of
+        test whether the relation is accurate, implicitly tests sequence of
         nodes.
         """
 
@@ -633,7 +633,7 @@ def test_intercept(self, distribution, n_categories, noise_scale):
     @pytest.mark.parametrize("distribution", ["probit", "logit"])
     def test_independence(self, graph_gen, seed, num_nodes, n_categories, distribution):
         """
-        test whether the relation is accurate, implicitely tests sequence of
+        test whether the relation is accurate, implicitly tests sequence of
         nodes.
         """
         sm = graph_gen(num_nodes=num_nodes, seed=seed, weight=None)
diff --git a/tests/structure/test_dist_type.py b/tests/structure/test_dist_type.py
index a577ed8..b4dffb8 100644
--- a/tests/structure/test_dist_type.py
+++ b/tests/structure/test_dist_type.py
@@ -176,7 +176,7 @@ def test_preprocess_X_expanded_cols(self, dist_type, X):
         # check size of column expansion
         assert X.shape[1] == 6
 
-        # check that the correct indecies are pulled out
+        # check that the correct indices are pulled out
         assert dist_types[0].idx_group == [0, 2, 3]
         assert dist_types[1].idx_group == [1, 4, 5]
         # test that the expanded get_columns works
diff --git a/tests/structure/test_dynotears.py b/tests/structure/test_dynotears.py
index 31094e6..0ff60b3 100644
--- a/tests/structure/test_dynotears.py
+++ b/tests/structure/test_dynotears.py
@@ -266,7 +266,7 @@ def test_edges_contain_weight(self, data_dynotears_p2):
         )
 
     def test_certain_relationships_get_near_certain_weight(self):
-        """If a == b always, ther should be an edge a->b or b->a with coefficient close to one"""
+        """If a == b always, there should be an edge a->b or b->a with coefficient close to one"""
         np.random.seed(17)
 
         data = pd.DataFrame(
@@ -281,7 +281,7 @@ def test_certain_relationships_get_near_certain_weight(self):
         assert 0.99 < edge <= 1.01
 
     def test_inverse_relationships_get_negative_weight(self):
-        """If a == -b always, ther should be an edge a->b or b->a with coefficient close to minus one"""
+        """If a == -b always, there should be an edge a->b or b->a with coefficient close to minus one"""
         np.random.seed(17)
 
         data = pd.DataFrame(
@@ -565,7 +565,7 @@ def test_edges_contain_weight(self, data_dynotears_p3):
         )
 
     def test_certain_relationships_get_near_certain_weight(self):
-        """If a == b always, ther should be an edge a->b or b->a with coefficient close to one"""
+        """If a == b always, there should be an edge a->b or b->a with coefficient close to one"""
         np.random.seed(17)
 
         data = pd.DataFrame(
diff --git a/tests/structure/test_notears.py b/tests/structure/test_notears.py
index 1daffde..887ee7f 100644
--- a/tests/structure/test_notears.py
+++ b/tests/structure/test_notears.py
@@ -716,7 +716,7 @@ def test_non_negativity_constraint(self, train_data_idx):
         """
         The optimisation in notears lasso involves reshaping the initial similarity matrix
         into two strictly positive matrixes (w+ and w-) and imposing a non negativity constraint
-        to the solver. We test here if these two contraints are imposed.
+        to the solver. We test here if these two constraints are imposed.
 
         We check if:
         (1) bounds impose non negativity constraint
diff --git a/tests/structure/test_pytorch_notears.py b/tests/structure/test_pytorch_notears.py
index c8044da..1f970a9 100644
--- a/tests/structure/test_pytorch_notears.py
+++ b/tests/structure/test_pytorch_notears.py
@@ -226,7 +226,7 @@ def test_check_array(self, data):
             from_pandas(pd.DataFrame(data=data, columns=["a"]))
 
     def test_f1score_generated_binary(self):
-        """Binary strucutre learned should have good f1 score"""
+        """Binary structure learned should have good f1 score"""
         np.random.seed(10)
         sm = generate_structure(5, 2.0)
         df = generate_binary_dataframe(
@@ -255,7 +255,7 @@ def test_f1score_generated_binary(self):
         assert f1_score > 0.8
 
     def test_f1score_generated_poisson(self):
-        """Poisson strucutre learned should have good f1 score"""
+        """Poisson structure learned should have good f1 score"""
         np.random.seed(10)
         sm = generate_structure(5, 3.0)
         df = generate_count_dataframe(
@@ -445,7 +445,7 @@ def test_non_negativity_constraint(self, train_data_idx):
         """
         The optimisation in notears lasso involves reshaping the initial similarity matrix
         into two strictly positive matrixes (w+ and w-) and imposing a non negativity constraint
-        to the solver. We test here if these two contraints are imposed.
+        to the solver. We test here if these two constraints are imposed.
 
         We check if:
         (1) bounds impose non negativity constraint
diff --git a/tests/test_bayesiannetwork.py b/tests/test_bayesiannetwork.py
index 2c0110b..c4d0a96 100644
--- a/tests/test_bayesiannetwork.py
+++ b/tests/test_bayesiannetwork.py
@@ -815,7 +815,7 @@ def test_set_bad_cpd(self, bn, bad_cpd):
 
         with pytest.raises(
             ValueError,
-            match=r"Sum or integral of conditional probabilites for node b is not equal to 1.",
+            match=r"Sum or integral of conditional probabilities for node b is not equal to 1.",
         ):
             bn.set_cpd("b", bad_cpd)