Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 179ce54

Browse files
committed
Better docs and docstrings
1 parent 4c80e5d commit 179ce54

File tree

6 files changed

+53
-6
lines changed

6 files changed

+53
-6
lines changed

data_diff/__init__.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,17 @@ def connect_to_table(
1515
key_column: str = "id",
1616
thread_count: Optional[int] = 1,
1717
**kwargs,
18-
):
18+
) -> TableSegment:
1919
"""Connects to the given database, and creates a TableSegment instance
2020
2121
Parameters:
2222
db_info: Either a URI string, or a dict of connection options.
2323
table_name: Name of the table as a string, or a tuple that signifies the path.
2424
key_column: Name of the key column
25-
thread_count: Number of threads for this connection (only if using a threadpooled implementation)
25+
thread_count: Number of threads for this connection (only if using a threadpooled db implementation)
26+
27+
See Also:
28+
:meth:`connect`
2629
"""
2730

2831
db = connect(db_info, thread_count=thread_count)
@@ -61,13 +64,39 @@ def diff_tables(
6164
# There may be many pools, so number of actual threads can be a lot higher.
6265
max_threadpool_size: Optional[int] = 1,
6366
) -> Iterator:
64-
"""Efficiently finds the diff between table1 and table2.
67+
"""Finds the diff between table1 and table2.
68+
69+
Parameters:
70+
key_column (str): Name of the key column, which uniquely identifies each row (usually id)
71+
update_column (str, optional): Name of the update column, which signals that rows changed (usually updated_at or last_update).
72+
Used by `min_update` and `max_update`.
73+
extra_columns (Tuple[str, ...], optional): Extra columns to compare
74+
min_key (:data:`DbKey`, optional): Lowest key_column value, used to restrict the segment
75+
max_key (:data:`DbKey`, optional): Highest key_column value, used to restrict the segment
76+
min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
77+
max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
78+
algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
79+
bisection_factor (int): Number of segments to bisect into per iteration (only when algorithm is `HASHDIFF`).
80+
bisection_threshold (Number): Segment size, in rows, at which to stop bisecting and compare locally (only when algorithm is `HASHDIFF`).
81+
threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
82+
max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto. Only relevant when `threaded` is ``True``.
83+
There may be many pools, so number of actual threads can be a lot higher.
84+
85+
Note:
86+
The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
87+
`key_column`, `update_column`, `extra_columns`, `min_key`, `max_key`. If different values are needed per table, it's
88+
possible to omit them here, and instead set them directly when creating each :class:`TableSegment`.
6589
6690
Example:
6791
>>> table1 = connect_to_table('postgresql:///', 'Rating', 'id')
6892
>>> list(diff_tables(table1, table1))
6993
[]
7094
95+
See Also:
96+
:class:`TableSegment`
97+
:class:`HashDiffer`
98+
:class:`JoinDiffer`
99+
71100
"""
72101
tables = [table1, table2]
73102
override_attrs = {

data_diff/joindiff_tables.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,15 @@ def json_friendly_value(v):
7676

7777
@dataclass
7878
class JoinDifferBase(TableDiffer):
79-
"""Finds the diff between two SQL tables using JOINs"""
79+
"""Finds the diff between two SQL tables using JOINs
80+
81+
The two tables must reside in the same database, and their primary keys must be unique and not null.
82+
83+
Parameters:
84+
threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
85+
max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto. Only relevant when `threaded` is ``True``.
86+
There may be many pools, so number of actual threads can be a lot higher.
87+
"""
8088

8189
stats: dict = {}
8290
validate_unique_key: bool = True

data_diff/table_segment.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ class TableSegment:
2323
database (Database): Database instance. See :meth:`connect`
2424
table_path (:data:`DbPath`): Path to table in form of a tuple. e.g. `('my_dataset', 'table_name')`
2525
key_column (str): Name of the key column, which uniquely identifies each row (usually id)
26-
update_column (str, optional): Name of updated column, which signals that rows changed (usually updated_at or last_update)
26+
update_column (str, optional): Name of the update column, which signals that rows changed (usually updated_at or last_update).
27+
Used by `min_update` and `max_update`.
2728
extra_columns (Tuple[str, ...], optional): Extra columns to compare
2829
min_key (:data:`DbKey`, optional): Lowest key_column value, used to restrict the segment
2930
max_key (:data:`DbKey`, optional): Highest key_column value, used to restrict the segment

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"recommonmark",
4242
"sphinx_markdown_tables",
4343
"sphinx_copybutton",
44+
"enum_tools.autoenum",
4445
# 'sphinx_gallery.gen_gallery'
4546
]
4647

docs/python-api.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ Python API Reference
55
66
.. autofunction:: connect
77

8+
.. autofunction:: connect_to_table
9+
10+
.. autofunction:: diff_tables
11+
812
.. autoclass:: HashDiffer
913
:members: __init__, diff_tables
1014

@@ -17,6 +21,10 @@ Python API Reference
1721
.. autoclass:: data_diff.databases.database_types.AbstractDatabase
1822
:members:
1923

24+
.. autoclass:: data_diff.databases.database_types.AbstractDialect
25+
:members:
26+
2027
.. autodata:: DbKey
2128
.. autodata:: DbTime
2229
.. autodata:: DbPath
30+
.. autoenum:: Algorithm

docs/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@ sphinx_markdown_tables
44
sphinx-copybutton
55
sphinx-rtd-theme
66
recommonmark
7+
enum-tools[sphinx]
78

8-
# Requirements. TODO Use poetry instead of this redundant list
99
data_diff

0 commit comments

Comments
 (0)