diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..8a81993
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,17 @@
+version: 2
+
+build:
+  os: ubuntu-24.04
+  tools:
+    python: "3.8"
+  jobs:
+    post_install:
+      - sphinx-apidoc -M -o docs/generated sparsecoding
+      - pip install -r requirements.txt
+
+sphinx:
+ configuration: docs/conf.py
+
+python:
+  install:
+    - requirements: docs/requirements.txt
\ No newline at end of file
diff --git a/README.md b/README.md
index 56c2b86..a1a568a 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,37 @@
-# Sparse Coding
+# RCTN SparseCoding Library
-Reference sparse coding implementations for efficient learning and inference implemented in PyTorch with GPU support.
+[![Documentation](https://readthedocs.org/projects/sparsecoding/badge/?version=latest)](https://sparsecoding.readthedocs.io/)
-## Features
+`sparsecoding` is a Python library developed by UC Berkeley's [Redwood Center for Theoretical Neuroscience (RCTN)](https://redwood.berkeley.edu). It provides efficient, batched, and GPU-compatible [PyTorch](https://github.com/pytorch/pytorch) implementations of sparse coding-related algorithms, including dictionary learning, inference, and data processing.
+
+Historically, sparse coding has largely focused on learning sparse representations of images, and we provide visualization and transformation tools for working with such data. However, we structure the transformations, dictionary learning methods, and inference methods in a data-agnostic manner, making them applicable to a wide range of use cases.
-### Dictionary Learning
-* Repo currently includes classic patch-wise sparse coding dictionary learning.
+## Features
-### Implemented Inference Methods
+- Check out our [Quickstart Guide](https://sparsecoding.readthedocs.io/en/latest/quickstart.html) for an overview and setup instructions.
+- Refer to the [API Reference](https://sparsecoding.readthedocs.io/en/latest/api.html) for detailed usage of the library's features.
-* Locally Competative Algorithm (LCA)
-* Gradient Descent with Euler's method on Laplace Prior (Vanilla)
-* Laplacian Scale Mixture (LSM)
-* Iterative Shrinkage-threshold Algorithm (ISTA)
-* Generic PyTorch minimization of arbitrary loss function (PyTorchOptimizer)
## Setup
-1. Clone the repo.
-2. Navigate to the directory containing the repo directory.
-3. Run `pip install -e ".[all]"`
-4. Install the natural images dataset from this link: https://rctn.org/bruno/sparsenet/IMAGES.mat
-5. Try running the demo notebook: `examples/sparse_coding.ipynb`
+To install the library, follow these steps:
+
+```bash
+git clone https://github.com/rctn/sparsecoding.git
+cd sparsecoding
+pip install -e ".[all]"
+```
+
+Try running the demo notebook: `examples/sparse_coding.ipynb`.
+
+For more detailed instructions, see our [Installation Guide](https://sparsecoding.readthedocs.io/en/latest/install.html).
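+
+As a quick sanity check after installing, here's a minimal sketch. The constructor signature below follows the API reference, but the `infer` call and the dictionary's `[n_features, n_basis]` layout are assumptions on our part, so double-check them against the docs:
+
+```python
+import torch
+from sparsecoding import ISTA, load_bars_dictionary
+
+dictionary = load_bars_dictionary()      # bundled 16x16 "bars" dictionary
+n_features, n_basis = dictionary.shape   # assumed [n_features, n_basis] layout
+data = torch.randn(8, n_features)        # small random batch, one row per sample
+
+inference = ISTA(n_iter=100)
+coefficients = inference.infer(data, dictionary)  # assumed method name
+```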
-Note: If you are using a Jupyter notebook and change a source file, you can either: 1) restart the Jupyter kernel, or 2) follow instructions [here](https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html#autoreload).
+Note: If you're using a Jupyter notebook and make changes to the source files, you can either:
+* Restart the Jupyter kernel, or
+* Use the autoreload extension as explained [here](https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html#autoreload).
-## Contributing
-See the [contributing](docs/contributing.md) document!
+## Contributing
+We welcome contributions! Please see our [contributing guide](https://sparsecoding.readthedocs.io/en/latest/contributing.html) for details on how to get involved.
diff --git a/docs/api.rst b/docs/api.rst
new file mode 100644
index 0000000..fdc1a77
--- /dev/null
+++ b/docs/api.rst
@@ -0,0 +1,67 @@
+=============
+API Reference
+=============
+
+.. py:currentmodule:: sparsecoding
+
+Dictionary learning models
+--------------------------
+
+.. automodule:: sparsecoding.models
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+Inference methods
+-----------------
+
+.. automodule:: sparsecoding.inference
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+Visualization tools
+-------------------
+
+.. automodule:: sparsecoding.visualization
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+Priors
+------
+
+.. automodule:: sparsecoding.priors
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+Datasets
+--------
+
+.. automodule:: sparsecoding.datasets
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+Data transformations
+--------------------
+
+.. automodule:: sparsecoding.transforms.whiten
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+Image transformations
+---------------------
+
+.. automodule:: sparsecoding.transforms.images
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..6f3a0c9
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,44 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+import os
+
+project = 'RCTN sparsecoding'
+copyright = '2024, RCTN'
+author = 'RCTN'
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    'sphinx.ext.napoleon',
+    'sphinx.ext.autodoc',
+    'sphinx.ext.autosummary',
+]
+
+autodoc_default_options = {
+    'members': True,
+    'undoc-members': True,
+    'show-inheritance': True,
+    'inherited-members': False,
+}
+
+templates_path = ['_templates']
+exclude_patterns = [
+    '_build',
+    'Thumbs.db',
+    '.DS_Store',
+    '**/test_*',
+    '**/*_test.py',
+]
+
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'sphinx_rtd_theme'
+html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "/")
diff --git a/docs/contributing.md b/docs/contributing.md
deleted file mode 100644
index 5a49a47..0000000
--- a/docs/contributing.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# Contributing
-
-All contributions are welcome!
-
-## Bug Reporting
-
-If you find a bug, submit a bug report on GitHub Issues.
-
-## Adding Features/Fixing Bugs
-
-If you have identified a new feature or bug that you can fix yourself, please follow the following procedure.
-
-1. Fork `main` branch.
-2. Create a new branch to contain your changes.
-3. `add`, `commit`, and `push` your changes to this branch.
-4. Create a pull request (PR). See more information on submitting a PR request below.
-
-### Submitting a Pull Request
-
-1. If necessary, please **write your own unit tests** and place them near the code being tested. High-level tests, such as integration or example tests can be placed in the top-level "tests" folder.
-2. Verify that all tests are passed by running `python -m pytest .`.
-3. Be sure that your PR follows formatting guidelines, [PEP8](https://peps.python.org/pep-0008/) and [flake8](https://flake8.pycqa.org/en/latest/).
-4. Make sure the title of your PR summarizes the features/issues resolved in your branch.
-5. Submit your pull request and add reviewers.
-
-## Coding Style Guidelines
-
-The following are some guidelines on how new code should be written. Of course, there are special cases, and there will be exceptions to these rules.
-
-1. Format code in accordance with [flake8](https://flake8.pycqa.org/en/latest/) standard.
-2. Use underscores to separate words in non-class names: `n_samples` rather than `nsamples`.
-3. Avoid single-character variable names.
-
-## Docstrings
-
-When writing docstrings, please follow the following example.
-
-```py
-def count_beans(self, baz, use_gpu=False, foo="vector"
- bar=None):
- """Write a one-line summary for the method.
-
- Parameters
- ----------
- baz : array-like, shape [..., dim]
- Write a short description of parameter baz.
- use_gpu : bool, default=False
- Write a short description of parameter use_gpu.
- foo : str, {"vector", "matrix"}, default="vector"
- Write a short description of parameter foo.
- bar : array-like, shape [...,], optional
- Write a short description of parameter bar.
-
- Returns
- -------
- n_beans : array-like, shape [..., dim, dim]
- Write a short description of the result returned by the method.
-
- Notes
- -----
- If relevant, provide equations with (:math:)
- describing computations performed in the method.
-
- Example
- -------
- Provide code snippets showing how the method is used.
- You can link to scripts of the examples/ directory.
-
- Reference
- ---------
- If relevant, provide a reference with associated pdf or
- wikipedia page.
- ex:
- [1] Einstein, A., Podolsky, B., & Rosen, N. (1935). Can
- quantum-mechanical description of physical reality be
- considered complete?. Physical review, 47(10), 777.
- """
-```
diff --git a/docs/contributing.rst b/docs/contributing.rst
new file mode 100644
index 0000000..95791bd
--- /dev/null
+++ b/docs/contributing.rst
@@ -0,0 +1,63 @@
+============
+Contributing
+============
+
+We welcome all contributions to this project! Whether it’s reporting bugs, suggesting features,
+fixing issues, or improving documentation, your input is invaluable.
+
+Bug Reporting
+-------------
+
+If you encounter a bug, please report it by creating an issue on GitHub. Include as much detail as
+possible to help us reproduce and fix the issue.
+
+Adding Features or Fixing Bugs
+------------------------------
+
+If you’ve identified a new feature to add or a bug you can fix, follow these steps:
+
+#. Clone the ``main`` branch.
+#. Create a new branch to work on your changes. Use a descriptive name for your branch, such as
+ ``fix-issue-123`` or ``feature-add-logging``.
+#. Use ``add``, ``commit``, and ``push`` to save your changes to the new branch.
+#. Create a pull request (PR). See the "Submitting a Pull Request" section for more details.
+
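+For example, a typical workflow might look like this (the branch name is just an
+illustration):
+
+.. code:: bash
+
+   git clone https://github.com/rctn/sparsecoding.git
+   cd sparsecoding
+   git checkout -b fix-issue-123
+   # ...edit files...
+   git add -A
+   git commit -m "Fix issue 123"
+   git push origin fix-issue-123
+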
+Submitting a Pull Request
+-------------------------
+To ensure a smooth review process and maintain high code quality, follow these guidelines when
+submitting a PR:
+
+#. If applicable, write unit tests for your changes. We use the
+ `pytest <https://docs.pytest.org/>`_ framework. Every Python module, extension module,
+ or subpackage in the sparsecoding package directory should have a corresponding
+ ``test_<module>.py`` file. Pytest examines these files for test methods (named ``test*``)
+ and test classes (named ``Test*``). Add your tests to the appropriate ``test_*.py`` file
+ (create it if it doesn't already exist); a minimal test sketch appears after this list.
+#. Verify that all tests pass by running ``pytest sparsecoding/`` from the base repository directory.
+#. Ensure your code adheres to the formatting guidelines specified in
+ `PEP8 <https://peps.python.org/pep-0008/>`_ and validated by
+ `flake8 <https://flake8.pycqa.org/en/latest/>`_.
+#. Prepare a detailed and clear PR description:
+
+ * Summarize the purpose of the PR and the changes made.
+
+ * Include any relevant context, such as links to related issues or discussions.
+
+ * Specify testing steps or considerations for reviewers.
+
+#. Submit your PR and assign reviewers as necessary.
+#. Reviewers: Use squash and merge when merging the PR.
+
+ * Set the merge description to match the PR description.
+
+ * Squash commits into a single commit to maintain a clean project history.
+
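+For example, a minimal test sketch (we assume here that the bundled bars
+dictionary loads as a 2-D tensor):
+
+.. code:: python
+
+   # test_example.py -- pytest collects files named test_*.py
+   from sparsecoding import load_bars_dictionary
+
+   def test_bars_dictionary_is_2d():
+       dictionary = load_bars_dictionary()
+       assert dictionary.dim() == 2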
+
+Coding Style Guidelines
+-----------------------
+
+We follow the `NumPy documentation standards <https://numpydoc.readthedocs.io/en/latest/format.html>`_.
+
+1. Format your code according to the `flake8 <https://flake8.pycqa.org/en/latest/>`_ standard.
+2. Use underscores to separate words in non-class names (e.g., ``n_samples`` instead of ``nsamples``).
+3. Avoid single-character variable names.
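+
+For docstrings, here is a short sketch in the NumPy style, following the format
+used throughout the library:
+
+.. code:: python
+
+   def count_beans(baz, use_gpu=False):
+       """Write a one-line summary for the function.
+
+       Parameters
+       ----------
+       baz : array-like, shape [..., dim]
+           Write a short description of parameter baz.
+       use_gpu : bool, default=False
+           Write a short description of parameter use_gpu.
+
+       Returns
+       -------
+       n_beans : array-like, shape [..., dim, dim]
+           Write a short description of the result.
+       """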
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..0227015
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,35 @@
+.. sparsecoding documentation master file, created by
+ sphinx-quickstart on Fri Nov 15 13:23:58 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+#######################################
+RCTN sparsecoding library documentation
+#######################################
+
+`sparsecoding`_ is a Python library developed by the `Redwood Center for
+Theoretical Neuroscience (RCTN) <https://redwood.berkeley.edu>`_ which contains
+performant `PyTorch <https://pytorch.org>`_ implementations of sparse coding
+dictionary learning, inference, and data processing. It was written to be a
+useful research tool for applying various sparse coding methods to data.
+
+We believe that sharing code within the scientific community is an important
+part of science and we hope that the research community finds this library
+useful.
+
+.. _sparsecoding: https://github.com/rctn/sparsecoding/
+
+.. toctree::
+ :maxdepth: 1
+ :numbered:
+
+ install
+ quickstart
+ api
+ contributing
+
+
+License
+-------
+
+`sparsecoding`_ has a BSD-3-clause license, as found in the `LICENSE <https://github.com/rctn/sparsecoding/blob/main/LICENSE>`_ file.
\ No newline at end of file
diff --git a/docs/install.rst b/docs/install.rst
new file mode 100644
index 0000000..5514996
--- /dev/null
+++ b/docs/install.rst
@@ -0,0 +1,19 @@
+============
+Installation
+============
+
+The quickest way to install the library is by cloning it directly from GitHub:
+
+.. code:: bash
+
+ git clone https://github.com/rctn/sparsecoding.git
+ cd sparsecoding
+ pip install -e ".[all]"
+
+The last command installs the dependencies required for the RCTN sparse coding library, including:
+
+- ``numpy``
+- ``scipy``
+- ``matplotlib``
+- ``torch``
+- ``torchvision``
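+
+A quick way to check the installation is to import the package and load one of
+the bundled dictionaries:
+
+.. code:: python
+
+   import sparsecoding
+
+   # Load the bundled 16x16 "bars" dictionary that ships with the library.
+   dictionary = sparsecoding.load_bars_dictionary()
+   print(dictionary.shape)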
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
new file mode 100644
index 0000000..bc6ccb1
--- /dev/null
+++ b/docs/quickstart.rst
@@ -0,0 +1,40 @@
+==========
+Quickstart
+==========
+
+Overview
+--------
+
+.. _sparsecoding: https://github.com/rctn/sparsecoding/
+
+`sparsecoding`_ is a Python package that provides tools for implementing sparse coding algorithms.
+Traditionally, sparse coding has been primarily used for learning sparse representations of images.
+To support this, we include tools for visualization and data transformation specific to image data.
+However, we have designed the dictionary learning and inference methods to be data-agnostic,
+allowing for broader applications.
+
+The `sparsecoding`_ library is built largely on PyTorch, enabling it to inherit several
+performance benefits, such as:
+
+- GPU support
+- Batched operations
+- Auto-grad optimizers
+
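+For instance, the same batched call runs on CPU or GPU (a sketch using the
+``whiten`` transform; its signature is documented in the API reference):
+
+.. code:: python
+
+   import torch
+   from sparsecoding.transforms.whiten import whiten
+
+   device = "cuda" if torch.cuda.is_available() else "cpu"
+   X = torch.randn(1024, 64, device=device)  # batch of 1024 data vectors
+   X_white = whiten(X, algorithm="zca")      # whole batch whitened at once
+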
+Structure of the Library
+-------------------------
+
+The functionalities of `sparsecoding`_ are organized into several modules:
+
+- ``sparsecoding.models``: Contains dictionary learning models (e.g., SparseCoding).
+- ``sparsecoding.inference``: Includes algorithms for computing latent coefficients.
+- ``sparsecoding.visualization``: Provides tools for visualizing image dictionaries and data.
+- ``sparsecoding.priors``: Offers methods for sampling from various sparse coding priors.
+- ``sparsecoding.datasets``: Contains utilities for loading datasets.
+- ``sparsecoding.transforms``: Includes methods for working with data, such as whitening and
+ extracting patches from images.
+
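+The following sketch shows how a few of these modules fit together. Constructor
+arguments follow the API reference; the ``infer`` call and the dictionary's
+``[n_features, n_basis]`` layout are assumptions, so check them against the docs:
+
+.. code:: python
+
+   import torch
+   from sparsecoding import SparseCoding, LCA, load_bars_dictionary
+
+   dictionary = load_bars_dictionary()     # bundled demo dictionary
+   n_features, n_basis = dictionary.shape  # assumed [n_features, n_basis]
+
+   inference = LCA(n_iter=100, coeff_lr=1e-3, threshold=0.1)
+   # The model wraps the dictionary and updates it during learning.
+   model = SparseCoding(inference, n_basis, n_features, sparsity_penalty=0.2)
+
+   data = torch.randn(8, n_features)
+   coefficients = inference.infer(data, dictionary)  # assumed method name
+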
+Getting Started
+---------------
+
+Explore our `example notebooks <https://github.com/rctn/sparsecoding/tree/main/examples>`_
+to get started.
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..9ab2504
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,3 @@
+.
+sphinx
+sphinx-rtd-theme
\ No newline at end of file
diff --git a/sparsecoding/__init__.py b/sparsecoding/__init__.py
index 74847a6..d233a2c 100644
--- a/sparsecoding/__init__.py
+++ b/sparsecoding/__init__.py
@@ -1 +1,42 @@
-"Modules for sparse coding."
+from .models import SparseCoding
+from .inference import LCA, IHT, ISTA, LSM, MP, OMP, Vanilla, PyTorchOptimizer
+from .visualization import plot_dictionary, plot_patches
+from .priors import SpikeSlabPrior, L0Prior
+from .datasets import BarsDataset, FieldDataset
+from .dictionaries import (
+ load_dictionary_from_pickle,
+ load_bars_dictionary,
+ load_olshausen_dictionary,
+)
+
+__all__ = [
+ # Models
+ "SparseCoding",
+
+ # Inference
+ "LCA",
+ "IHT",
+ "ISTA",
+ "LSM",
+ "MP",
+ "OMP",
+ "Vanilla",
+ "PyTorchOptimizer",
+
+ # Visualization
+ "plot_dictionary",
+ "plot_patches",
+
+ # Priors
+ "SpikeSlabPrior",
+ "L0Prior",
+
+ # Dictionaries
+ "load_dictionary_from_pickle",
+ "load_bars_dictionary",
+ "load_olshausen_dictionary",
+
+ # Datasets
+ "BarsDataset",
+ "FieldDataset",
+]
diff --git a/sparsecoding/data/__init__.py b/sparsecoding/data/__init__.py
deleted file mode 100644
index 6f39034..0000000
--- a/sparsecoding/data/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Module for exposing datasets to users."""
diff --git a/sparsecoding/data/datasets/__init__.py b/sparsecoding/data/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sparsecoding/data/dictionaries/__init__.py b/sparsecoding/data/dictionaries/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sparsecoding/data/utils.py b/sparsecoding/data/utils.py
deleted file mode 100644
index 7eccfc2..0000000
--- a/sparsecoding/data/utils.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import os
-import torch
-import numpy as np
-import pickle as pkl
-
-MODULE_PATH = os.path.dirname(__file__)
-DATASET_PATH = os.path.join(MODULE_PATH, "datasets")
-DICTIONARY_PATH = os.path.join(MODULE_PATH, "dictionaries")
-
-BARS_DICT_PATH = os.path.join(DICTIONARY_PATH, "bars", "bars-16_by_16.p")
-OLSHAUSEN_DICT_PATH = os.path.join(DICTIONARY_PATH, "olshausen", "olshausen-1.5x_overcomplete.p")
-
-
-def load_dictionary_from_pickle(path):
- dictionary_file = open(path, 'rb')
- numpy_dictionary = pkl.load(dictionary_file)
- dictionary_file.close()
- dictionary = torch.tensor(numpy_dictionary.astype(np.float32))
- return dictionary
-
-
-def load_bars_dictionary():
- path = BARS_DICT_PATH
- dictionary_file = open(path, 'rb')
- numpy_dictionary = pkl.load(dictionary_file)
- dictionary_file.close()
- dictionary = torch.tensor(numpy_dictionary.astype(np.float32))
- return dictionary
-
-
-def load_olshausen_dictionary():
- path = OLSHAUSEN_DICT_PATH
- dictionary_file = open(path, 'rb')
- numpy_dictionary = pkl.load(dictionary_file)
- dictionary_file.close()
- dictionary = torch.tensor(numpy_dictionary.astype(np.float32))
- return dictionary
diff --git a/sparsecoding/inference/iht.py b/sparsecoding/inference/iht.py
index 467cfc5..39ee55d 100644
--- a/sparsecoding/inference/iht.py
+++ b/sparsecoding/inference/iht.py
@@ -9,25 +9,23 @@ class IHT(InferenceMethod):
Infer coefficients for each image in data using elements dictionary.
Method description can be traced to
"Iterative Hard Thresholding for Compressed Sensing" (T. Blumensath & M. E. Davies, 2009)
+
+ Parameters
+ ----------
+ sparsity : float
+ Sparsity of the solution. The number of active coefficients will be set
+ to ceil(sparsity * data_dim) at the end of each iterative update.
+ n_iter : int, default=10
+ number of iterations to run for an inference method
+ return_all_coefficients : bool, default=False
+ returns all coefficients during inference procedure if True.
+ user beware: if n_iter is large, setting this parameter to True
+ can result in large memory usage/potential exhaustion. This function is
+ typically used for debugging
+ solver : default=None
"""
def __init__(self, sparsity, n_iter=10, solver=None, return_all_coefficients=False):
- """
-
- Parameters
- ----------
- sparsity : scalar (1,)
- Sparsity of the solution. The number of active coefficients will be set
- to ceil(sparsity * data_dim) at the end of each iterative update.
- n_iter : scalar (1,) default=100
- number of iterations to run for an inference method
- return_all_coefficients : string (1,) default=False
- returns all coefficients during inference procedure if True
- user beware: if n_iter is large, setting this parameter to True
- can result in large memory usage/potential exhaustion. This function typically used for
- debugging
- solver : default=None
- """
super().__init__(solver)
self.n_iter = n_iter
self.sparsity = sparsity
diff --git a/sparsecoding/inference/ista.py b/sparsecoding/inference/ista.py
index 42bd21b..dec12d1 100644
--- a/sparsecoding/inference/ista.py
+++ b/sparsecoding/inference/ista.py
@@ -4,6 +4,33 @@
class ISTA(InferenceMethod):
+ """
+ Iterative shrinkage-thresholding algorithm for solving LASSO problems.
+
+ Parameters
+ ----------
+ n_iter : int, default=100
+ Number of iterations to run
+ sparsity_penalty : float, default=0.2
+ Weight of the L1 sparsity penalty
+ stop_early : bool, default=False
+ Stops dynamics early based on change in coefficients
+ epsilon : float, default=1e-2
+ Only used if stop_early is True; specifies criteria to stop dynamics
+ return_all_coefficients : bool, default=False
+ Returns all coefficients during inference procedure if True.
+ User beware: if n_iter is large, setting this parameter to True
+ can result in large memory usage/potential exhaustion. This
+ function is typically used for debugging.
+ solver : default=None
+
+ References
+ ----------
+ [1] Beck, A., & Teboulle, M. (2009). A fast iterative
+ shrinkage-thresholding algorithm for linear inverse problems.
+ SIAM journal on imaging sciences, 2(1), 183-202.
+ """
+
def __init__(
self,
n_iter=100,
@@ -13,31 +40,6 @@ def __init__(
solver=None,
return_all_coefficients=False,
):
- """Iterative shrinkage-thresholding algorithm for solving LASSO problems.
-
- Parameters
- ----------
- n_iter : int, default=100
- Number of iterations to run
- sparsity_penalty : float, default=0.2
-
- stop_early : bool, default=False
- Stops dynamics early based on change in coefficents
- epsilon : float, default=1e-2
- Only used if stop_early True, specifies criteria to stop dynamics
- return_all_coefficients : str, default=False
- Returns all coefficients during inference procedure if True
- User beware: if n_iter is large, setting this parameter to True
- can result in large memory usage/potential exhaustion. This
- function typically used for debugging.
- solver : default=None
-
- References
- ----------
- [1] Beck, A., & Teboulle, M. (2009). A fast iterative
- shrinkage-thresholding algorithm for linear inverse problems.
- SIAM journal on imaging sciences, 2(1), 183-202.
- """
super().__init__(solver)
self.n_iter = n_iter
self.sparsity_penalty = sparsity_penalty
diff --git a/sparsecoding/inference/lca.py b/sparsecoding/inference/lca.py
index ad5023f..61eb0e4 100644
--- a/sparsecoding/inference/lca.py
+++ b/sparsecoding/inference/lca.py
@@ -4,6 +4,41 @@
class LCA(InferenceMethod):
+ """
+ Method implemented according to the locally competitive algorithm (LCA)
+ with the ideal soft thresholding function.
+
+ Parameters
+ ----------
+ n_iter : int, default=100
+ Number of iterations to run
+ coeff_lr : float, default=1e-3
+ Update rate of coefficient dynamics
+ threshold : float, default=0.1
+ Threshold for non-linearity
+ stop_early : bool, default=False
+ Stops dynamics early based on change in coefficients
+ epsilon : float, default=1e-2
+ Only used if stop_early is True; specifies criteria to stop dynamics
+ nonnegative : bool, default=False
+ Constrain coefficients to be nonnegative
+ return_all_coefficients : str, {"none", "membrane", "active"}, default="none"
+ Returns all coefficients during inference procedure if not equal
+ to "none". If return_all_coefficients=="membrane", membrane
+ potentials (u) are returned. If return_all_coefficients=="active",
+ active units (a) (output of thresholding function over u) are returned.
+ User beware: if n_iter is large, setting this parameter to a value
+ other than "none" can result in large memory usage/potential
+ exhaustion. This function is typically used for debugging.
+ solver : default=None
+
+ References
+ ----------
+ [1] Rozell, C. J., Johnson, D. H., Baraniuk, R. G., & Olshausen,
+ B. A. (2008). Sparse coding via thresholding and local competition
+ in neural circuits. Neural computation, 20(10), 2526-2563.
+ """
+
def __init__(
self,
n_iter=100,
@@ -15,39 +50,6 @@ def __init__(
return_all_coefficients="none",
nonnegative=False,
):
- """Method implemented according locally competative algorithm (LCA)
- with the ideal soft thresholding function.
-
- Parameters
- ----------
- n_iter : int, default=100
- Number of iterations to run
- coeff_lr : float, default=1e-3
- Update rate of coefficient dynamics
- threshold : float, default=0.1
- Threshold for non-linearity
- stop_early : bool, default=False
- Stops dynamics early based on change in coefficents
- epsilon : float, default=1e-2
- Only used if stop_early True, specifies criteria to stop dynamics
- nonnegative : bool, default=False
- Constrain coefficients to be nonnegative
- return_all_coefficients : str, {"none", "membrane", "active"}, default="none"
- Returns all coefficients during inference procedure if not equal
- to "none". If return_all_coefficients=="membrane", membrane
- potentials (u) returned. If return_all_coefficients=="active",
- active units (a) (output of thresholding function over u) returned.
- User beware: if n_iter is large, setting this parameter to True
- can result in large memory usage/potential exhaustion. This
- function typically used for debugging.
- solver : default=None
-
- References
- ----------
- [1] Rozell, C. J., Johnson, D. H., Baraniuk, R. G., & Olshausen,
- B. A. (2008). Sparse coding via thresholding and local competition
- in neural circuits. Neural computation, 20(10), 2526-2563.
- """
super().__init__(solver)
self.threshold = threshold
self.coeff_lr = coeff_lr
diff --git a/sparsecoding/inference/lsm.py b/sparsecoding/inference/lsm.py
index 83e5ed5..636610b 100644
--- a/sparsecoding/inference/lsm.py
+++ b/sparsecoding/inference/lsm.py
@@ -4,6 +4,40 @@
class LSM(InferenceMethod):
+ """
+ Infer latent coefficients generating data given dictionary.
+ Method implemented according to "Group Sparse Coding with a Laplacian
+ Scale Mixture Prior" (P. J. Garrigues & B. A. Olshausen, 2010)
+
+ Parameters
+ ----------
+ n_iter : int, default=100
+ Number of iterations to run for an optimizer
+ n_iter_LSM : int, default=6
+ Number of iterations to run the outer loop of LSM
+ beta : float, default=0.01
+ LSM parameter used to update lambdas
+ alpha : float, default=80.0
+ LSM parameter used to update lambdas
+ sigma : float, default=0.005
+ LSM parameter used to compute the loss function
+ sparse_threshold : float, default=10**-2
+ Threshold used to discard the smallest coefficients in the final
+ solution
+ return_all_coefficients : bool, default=False
+ Returns all coefficients during inference procedure if True.
+ User beware: If n_iter is large, setting this parameter to True
+ can result in large memory usage/potential exhaustion. This
+ function is typically used for debugging.
+ solver : default=None
+
+ References
+ ----------
+ [1] Garrigues, P., & Olshausen, B. (2010). Group sparse coding with
+ a laplacian scale mixture prior. Advances in neural information
+ processing systems, 23.
+ """
+
def __init__(
self,
n_iter=100,
@@ -15,38 +49,6 @@ def __init__(
solver=None,
return_all_coefficients=False,
):
- """Infer latent coefficients generating data given dictionary.
- Method implemented according to "Group Sparse Coding with a Laplacian
- Scale Mixture Prior" (P. J. Garrigues & B. A. Olshausen, 2010)
-
- Parameters
- ----------
- n_iter : int, default=100
- Number of iterations to run for an optimizer
- n_iter_LSM : int, default=6
- Number of iterations to run the outer loop of LSM
- beta : float, default=0.01
- LSM parameter used to update lambdas
- alpha : float, default=80.0
- LSM parameter used to update lambdas
- sigma : float, default=0.005
- LSM parameter used to compute the loss function
- sparse_threshold : float, default=10**-2
- Threshold used to discard smallest coefficients in the final
- solution SM parameter used to compute the loss function
- return_all_coefficients : bool, default=False
- Returns all coefficients during inference procedure if True
- User beware: If n_iter is large, setting this parameter to True
- can result in large memory usage/potential exhaustion. This
- function typically used for debugging.
- solver : default=None
-
- References
- ----------
- [1] Garrigues, P., & Olshausen, B. (2010). Group sparse coding with
- a laplacian scale mixture prior. Advances in neural information
- processing systems, 23.
- """
super().__init__(solver)
self.n_iter = n_iter
self.n_iter_LSM = n_iter_LSM
diff --git a/sparsecoding/inference/mp.py b/sparsecoding/inference/mp.py
index 4305976..c402f3b 100644
--- a/sparsecoding/inference/mp.py
+++ b/sparsecoding/inference/mp.py
@@ -9,22 +9,20 @@ class MP(InferenceMethod):
Infer coefficients for each image in data using elements dictionary.
Method description can be traced
to "Matching Pursuits with Time-Frequency Dictionaries" (S. G. Mallat & Z. Zhang, 1993)
+
+ Parameters
+ ----------
+ sparsity : scalar (1,)
+ sparsity of the solution
+ return_all_coefficients : bool, default=False
+ returns all coefficients during inference procedure if True.
+ user beware: if the number of iterations is large, setting this parameter
+ to True can result in large memory usage/potential exhaustion. This
+ function is typically used for debugging
+ solver : default=None
"""
def __init__(self, sparsity, solver=None, return_all_coefficients=False):
- """
-
- Parameters
- ----------
- sparsity : scalar (1,)
- sparsity of the solution
- return_all_coefficients : string (1,) default=False
- returns all coefficients during inference procedure if True
- user beware: if n_iter is large, setting this parameter to True
- can result in large memory usage/potential exhaustion. This function typically used for
- debugging
- solver : default=None
- """
super().__init__(solver)
self.sparsity = sparsity
self.return_all_coefficients = return_all_coefficients
diff --git a/sparsecoding/inference/omp.py b/sparsecoding/inference/omp.py
index db99427..b11789b 100644
--- a/sparsecoding/inference/omp.py
+++ b/sparsecoding/inference/omp.py
@@ -10,22 +10,20 @@ class OMP(InferenceMethod):
Method description can be traced to:
"Orthogonal Matching Pursuit: Recursive Function Approximation with Application to Wavelet Decomposition"
(Y. Pati & R. Rezaiifar & P. Krishnaprasad, 1993)
+
+ Parameters
+ ----------
+ sparsity : scalar (1,)
+ sparsity of the solution
+ return_all_coefficients : bool, default=False
+ returns all coefficients during inference procedure if True.
+ user beware: if the number of iterations is large, setting this parameter
+ to True can result in large memory usage/potential exhaustion. This
+ function is typically used for debugging
+ solver : default=None
"""
def __init__(self, sparsity, solver=None, return_all_coefficients=False):
- """
-
- Parameters
- ----------
- sparsity : scalar (1,)
- sparsity of the solution
- return_all_coefficients : string (1,) default=False
- returns all coefficients during inference procedure if True
- user beware: if n_iter is large, setting this parameter to True
- can result in large memory usage/potential exhaustion. This function typically used for
- debugging
- solver : default=None
- """
super().__init__(solver)
self.sparsity = sparsity
self.return_all_coefficients = return_all_coefficients
diff --git a/sparsecoding/inference/pytorch_optimizer.py b/sparsecoding/inference/pytorch_optimizer.py
index 7c13079..bc98c88 100644
--- a/sparsecoding/inference/pytorch_optimizer.py
+++ b/sparsecoding/inference/pytorch_optimizer.py
@@ -4,24 +4,26 @@
class PyTorchOptimizer(InferenceMethod):
- def __init__(self, optimizer_f, loss_f, n_iter=100, solver=None):
- """Infer coefficients using provided loss functional and optimizer
+ """
+ Infer coefficients using provided loss functional and optimizer.
+
+ Parameters
+ ----------
+ optimizer_f : function handle
+ PyTorch optimizer handle that takes a single parameter:
+ (coefficients)
+ where coefficients is of shape [batch_size, n_basis]
+ loss_f : function handle
+ Must have parameters:
+ (data, dictionary, coefficients)
+ where data is of shape [batch_size, n_features]
+ and loss_f must return tensor of size [batch_size,]
+ n_iter : int, default=100
+ Number of iterations to run for an optimizer
+ solver : default=None
+ """
- Parameters
- ----------
- optimizer : function handle
- Pytorch optimizer handle have single parameter:
- (coefficients)
- where coefficients is of shape [batch_size, n_basis]
- loss_f : function handle
- Must have parameters:
- (data, dictionary, coefficients)
- where data is of shape [batch_size, n_features]
- and loss_f must return tensor of size [batch_size,]
- n_iter : int, default=100
- Number of iterations to run for an optimizer
- solver : default=None
- """
+ def __init__(self, optimizer_f, loss_f, n_iter=100, solver=None):
super().__init__(solver)
self.optimizer_f = optimizer_f
self.loss_f = loss_f
diff --git a/sparsecoding/inference/vanilla.py b/sparsecoding/inference/vanilla.py
index 11bbfa4..28b7be5 100644
--- a/sparsecoding/inference/vanilla.py
+++ b/sparsecoding/inference/vanilla.py
@@ -4,6 +4,37 @@
class Vanilla(InferenceMethod):
+ """
+ Gradient descent with Euler's method on the model in Olshausen & Field
+ (1997) with a Laplace prior over coefficients (corresponding to an
+ L1-norm penalty).
+
+ Parameters
+ ----------
+ n_iter : int, default=100
+ Number of iterations to run
+ coeff_lr : float, default=1e-3
+ Update rate of coefficient dynamics
+ sparsity_penalty : float, default=0.2
+ Weight of the sparsity penalty
+ stop_early : bool, default=False
+ Stops dynamics early based on change in coefficients
+ epsilon : float, default=1e-2
+ Only used if stop_early is True; specifies criteria to stop dynamics
+ return_all_coefficients : bool, default=False
+ Returns all coefficients during inference procedure if True.
+ User beware: If n_iter is large, setting this parameter to True
+ can result in large memory usage/potential exhaustion. This
+ function is typically used for debugging.
+ solver : default=None
+
+ References
+ ----------
+ [1] Olshausen, B. A., & Field, D. J. (1997). Sparse coding with an
+ overcomplete basis set: A strategy employed by V1?. Vision research,
+ 37(23), 3311-3325.
+ """
+
def __init__(
self,
n_iter=100,
@@ -14,35 +45,6 @@ def __init__(
solver=None,
return_all_coefficients=False,
):
- """Gradient descent with Euler's method on model in Olshausen & Field
- (1997) with laplace prior over coefficients (corresponding to l-1 norm
- penalty).
-
- Parameters
- ----------
- n_iter : int, default=100
- Number of iterations to run
- coeff_lr : float, default=1e-3
- Update rate of coefficient dynamics
- sparsity_penalty : float, default=0.2
-
- stop_early : bool, default=False
- Stops dynamics early based on change in coefficents
- epsilon : float, default=1e-2
- Only used if stop_early True, specifies criteria to stop dynamics
- return_all_coefficients : str, default=False
- Returns all coefficients during inference procedure if True
- User beware: If n_iter is large, setting this parameter to True
- Can result in large memory usage/potential exhaustion. This
- function typically used for debugging.
- solver : default=None
-
- References
- ----------
- [1] Olshausen, B. A., & Field, D. J. (1997). Sparse coding with an
- overcomplete basis set: A strategy employed by V1?. Vision research,
- 37(23), 3311-3325.
- """
super().__init__(solver)
self.coeff_lr = coeff_lr
self.sparsity_penalty = sparsity_penalty
diff --git a/sparsecoding/models.py b/sparsecoding/models.py
index 5f19db1..1316dbd 100644
--- a/sparsecoding/models.py
+++ b/sparsecoding/models.py
@@ -5,31 +5,31 @@
class SparseCoding(torch.nn.Module):
+ """Class for learning a sparse code via dictionary learning
+
+ Parameters
+ ----------
+ inference_method : sparsecoding.InferenceMethod
+ Method for inferring coefficients for each image given the
+ dictionary
+ n_basis : int
+ Number of basis functions in dictionary
+ n_features : int
+ Number of features in data
+ sparsity_penalty : float, default=0.2
+ Sparsity penalty
+ dictionary_lr : float, default=1e-2
+ Learning rate of dictionary update
+ device : torch.device, default=torch.device("cpu")
+ Which device to utilize
+ check_for_dictionary_nan : bool, default=False
+ Flag to check for nans in the dictionary after gradient
+ updates and normalizations. Raises ValueError if nan
+ found
+ """
def __init__(self, inference_method, n_basis, n_features,
sparsity_penalty=0.2, device=None, check_for_dictionary_nan=False, **kwargs):
- """Class for learning a sparse code via dictionary learning
-
- Parameters
- ----------
- inference_method : sparsecoding.InferenceMethod
- Method for inferring coefficients for each image given the
- dictionary
- n_basis : int
- Number of basis functions in dictionary
- n_features : int
- Number of features in data
- sparsity_penalty : float, default=0.2
- Sparsity penalty
- dictionary_lr : float, default=1e-2
- Learning rate of dictionary update
- device : torch.device, default=torch.device("cpu")
- Which device to utilize
- check_for_dictionary_nan : bool, default=False
- Flag to check for nans in the dictionary after gradient
- updates and normalizations. Raises ValueError if nan
- found
- """
super(SparseCoding, self).__init__()
self.inference_method = inference_method
self.n_basis = n_basis
diff --git a/sparsecoding/transforms/images.py b/sparsecoding/transforms/images.py
index 9e0f20b..cd48994 100644
--- a/sparsecoding/transforms/images.py
+++ b/sparsecoding/transforms/images.py
@@ -39,12 +39,15 @@ def whiten_images(images: torch.Tensor, algorithm: str, stats: Dict = None, **kw
Parameters
----------
- images: tensor of shape (N, C, H, W)
- algorithm: what whitening transform we want to use
- stats: dictionary of dataset statistics needed for whitening transformations
+ images : torch.Tensor
+ Tensor of shape (N, C, H, W)
+ algorithm : str
+ What whitening transform we want to use
+ stats : Dict, default=None
+ Dictionary of dataset statistics needed for whitening transformations
Returns
- ----------
+ -------
Tensor of whitened data in shape (N, C, H, W)
"""
@@ -72,13 +75,11 @@ def compute_image_whitening_stats(images: torch.Tensor) -> Dict:
Parameters
----------
- images: tensor of shape (N, C, H, W)
- n_components: Number of principal components to keep. If None, keep all components.
- If int, keep that many components. If float between 0 and 1,
- keep components that explain that fraction of variance.
+ images : torch.Tensor
+ Tensor of shape (N, C, H, W)
Returns
- ----------
+ -------
Dictionary containing whitening statistics (eigenvalues, eigenvectors, mean)
"""
check_images(images)
@@ -92,11 +93,13 @@ def create_frequency_filter(image_size: int, f0_factor: float = 0.4) -> torch.Te
Parameters
----------
- image_size: Size of the square image
- f0_factor: Factor for determining the cutoff frequency (default 0.4)
+ image_size : int
+ Size of the square image
+ f0_factor : float, default=0.4
+ Factor for determining the cutoff frequency
Returns
- ----------
+ -------
torch.Tensor: Frequency domain filter
"""
fx = torch.linspace(-image_size / 2, image_size / 2 - 1, image_size)
@@ -117,11 +120,13 @@ def get_cached_filter(image_size: int, f0_factor: float = 0.4) -> torch.Tensor:
Parameters
----------
- image_size: Size of the square image
- f0_factor: Factor for determining the cutoff frequency
+ image_size : int
+ Size of the square image
+ f0_factor : float, default=0.4
+ Factor for determining the cutoff frequency
Returns
- ----------
+ -------
torch.Tensor: Cached frequency domain filter
"""
return create_frequency_filter(image_size, f0_factor)
@@ -133,11 +138,13 @@ def normalize_variance(tensor: torch.Tensor, target_variance: float = 1.0) -> to
Parameters
----------
- tensor: Input tensor
- target_variance: Desired variance after normalization
+ tensor : torch.Tensor
+ Input tensor
+ target_variance : float, default=1.0
+ Desired variance after normalization
Returns
- ----------
+ -------
torch.Tensor: Normalized tensor
"""
@@ -156,12 +163,15 @@ def whiten_channel(channel: torch.Tensor, filt: torch.Tensor, target_variance: f
Parameters
----------
- channel: Single channel image tensor
- filt: Frequency domain filter
- target_variance: Target variance for normalization
+ channel : torch.Tensor
+ Single channel image tensor
+ filt : torch.Tensor
+ Frequency domain filter
+ target_variance : float, default=1.0
+ Target variance for normalization
Returns
- ----------
+ -------
torch.Tensor: Whitened channel
"""
@@ -189,12 +199,15 @@ def frequency_whitening(images: torch.Tensor, target_variance: float = 0.1, f0_f
Parameters
----------
- images: Input images of shape (N, C, H, W)
- target_variance: Target variance for normalization
- f0_factor: Factor for determining filter cutoff frequency
+ images : torch.Tensor
+ Input images of shape (N, C, H, W)
+ target_variance : float, default=0.1
+ Target variance for normalization
+ f0_factor : float, default = 0.4
+ Factor for determining filter cutoff frequency
Returns
- ----------
+ -------
torch.Tensor: Whitened images
"""
_, _, H, W = images.shape
@@ -216,19 +229,20 @@ class WhiteningTransform(object):
"""
A PyTorch transform for image whitening that can be used in a transform pipeline.
Supports frequency, PCA, and ZCA whitening methods.
+
+ Parameters
+ ----------
+ algorithm : str
+ One of ['frequency', 'pca', 'zca', 'cholesky']
+ stats : Dict or None, default=None
+ Pre-computed statistics for PCA/ZCA whitening
+ compute_stats : bool, default=False
+ If True, will compute stats on first batch seen
+ **kwargs
+ Additional arguments passed to whitening function
"""
def __init__(self, algorithm: str = "zca", stats: Optional[Dict] = None, compute_stats: bool = False, **kwargs):
- """
- Initialize whitening transform.
-
- Parameters
- ----------
- algorithm: One of ['frequency', 'pca', 'zca', 'cholesky]
- stats: Pre-computed statistics for PCA/ZCA whitening
- compute_stats: If True, will compute stats on first batch seen
- **kwargs: Additional arguments passed to whitening function
- """
self.algorithm = algorithm
self.stats = stats
self.compute_stats = compute_stats
@@ -240,11 +254,12 @@ def __call__(self, images: torch.Tensor) -> torch.Tensor:
Parameters
----------
- images: Input images of shape [N, C, H, W] or [C, H, W]
+ images : torch.Tensor
+ Input images of shape [N, C, H, W] or [C, H, W]
Returns
- ----------
- Whitened images of same shape as input
+ -------
+ Whitened images of same shape as input
"""
# Add batch dimension if necessary
if images.dim() == 3:
@@ -281,10 +296,8 @@ def sample_random_patches(
num_patches : int
Number of patches to sample.
image : Tensor, shape [*, C, H, W]
- where:
- C is the number of channels,
- H is the image height,
- W is the image width.
+ where: C is the number of channels, H is the image height,
+ W is the image width.
Returns
-------
@@ -333,10 +346,8 @@ def patchify(
patch_size : int
Patch side length.
image : Tensor, shape [*, C, H, W]
- where:
- C is the number of channels,
- H is the image height,
- W is the image width.
+ where: C is the number of channels, H is the image height,
+ W is the image width.
stride : int, optional
Stride between patches in pixel space. If not specified, set to
`patch_size` (non-overlapping patches).
@@ -345,10 +356,8 @@ def patchify(
-------
patches : Tensor, shape [*, N, C, P, P]
Non-overlapping patches taken from the input image,
- where:
- P is the patch size,
- N is the number of patches, equal to H//P * W//P,
- C is the number of channels of the input image.
+ where: P is the patch size, N is the number of patches, equal
+ to H//P * W//P, C is the number of channels of the input image.
"""
leading_dims = image.shape[:-3]
C, H, W = image.shape[-3:]
@@ -394,10 +403,8 @@ def quilt(
Width for the reconstructed image.
patches : Tensor, shape [*, N, C, P, P]
Non-overlapping patches from an input image,
- where:
- P is the patch size,
- N is the number of patches,
- C is the number of channels in the image.
+ where: P is the patch size, N is the number of patches,
+ C is the number of channels in the image.
Returns
-------
diff --git a/sparsecoding/transforms/whiten.py b/sparsecoding/transforms/whiten.py
index 4206adc..c2ad175 100644
--- a/sparsecoding/transforms/whiten.py
+++ b/sparsecoding/transforms/whiten.py
@@ -9,10 +9,11 @@ def compute_whitening_stats(X: torch.Tensor):
Parameters
----------
- X: Input data of size [N, D]
+ X : torch.Tensor
+ Input data of size [N, D]
Returns
- ----------
+ -------
Dictionary containing whitening statistics (eigenvalues, eigenvectors, mean)
"""
@@ -43,20 +44,25 @@ def whiten(
Parameters
----------
- X: Input data of shape [N, D] where N are unique data elements of dimensionality D
- algorithm: Whitening transform we want to apply, one of ['zca', 'pca', or 'cholesky']
- stats: Dict containing precomputed whitening statistics (mean, eigenvectors, eigenvalues)
- n_components: Number of principal components to keep. If None, keep all components.
- If int, keep that many components. If float between 0 and 1,
- keep components that explain that fraction of variance.
- epsilon: Optional small constant to prevent division by zero
+ X : torch.Tensor
+ Input data of shape [N, D] where N are unique data elements of dimensionality D
+ algorithm : str, default="zca"
+ Whitening transform we want to apply, one of ['zca', 'pca', or 'cholesky']
+ stats : Dict, default=None
+ Dict containing precomputed whitening statistics (mean, eigenvectors, eigenvalues)
+ n_components : int or float, default=None
+ Number of principal components to keep. If None, keep all components.
+ If int, keep that many components. If float between 0 and 1,
+ keep components that explain that fraction of variance.
+ epsilon : float, default=0.0
+ Optional small constant to prevent division by zero
Returns
- ----------
+ -------
Whitened data of shape [N, D]
Notes
- ----------
+ -----
See examples/Data_Whitening.ipynb for usage examples, and brief discussion about the different whitening methods
See https://arxiv.org/abs/1512.00809 for extensive details on whitening transformations