-
-
Notifications
You must be signed in to change notification settings - Fork 50.4k
Add t stochastic neighbour embedding using Iris dataset #13476
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 9 commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
8c581f7
Added t-SNE with Iris dataset example
Khansa435 a394193
Added t-SNE with Iris dataset example
Khansa435 165f516
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 238f54d
Merge branch 'master' into add-tsne-algorithm
Khansa435 fb0fdb4
Updated with descriptive variables
Khansa435 18c5d96
Add descriptive variable names
Khansa435 ef68a5f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] d1a552d
Add Descriptive Variable names
Khansa435 edba972
Merge branch 'add-tsne-algorithm' of https://github.com/Khansa435/Pyt…
Khansa435 6a495bd
Adding Descriptive variable names
Khansa435 7e39675
Update machine_learning/t_stochastic_neighbour_embedding.py
Khansa435 5e60cf2
Update machine_learning/t_stochastic_neighbour_embedding.py
Khansa435 498c137
Improved line formatting
Khansa435 5cb544d
Merge branch 'add-tsne-algorithm' of https://github.com/Khansa435/Pyt…
Khansa435 da78c47
Adding URL for t-SNE Wikipedia
Khansa435 ff974ad
Apply suggestion from @cclauss
cclauss File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,175 @@ | ||
| import doctest | ||
| import numpy as np | ||
| from numpy import ndarray | ||
| from sklearn.datasets import load_iris | ||
|
|
||
|
|
||
| def collect_dataset() -> tuple[ndarray, ndarray]: | ||
| """ | ||
| Load the Iris dataset and return features and labels. | ||
|
|
||
| Returns: | ||
| tuple[ndarray, ndarray]: Feature matrix and target labels. | ||
|
|
||
| Example: | ||
| >>> features, targets = collect_dataset() | ||
| >>> features.shape | ||
| (150, 4) | ||
| >>> targets.shape | ||
| (150,) | ||
|
Khansa435 marked this conversation as resolved.
Outdated
|
||
| """ | ||
| iris_dataset = load_iris() | ||
| return np.array(iris_dataset.data), np.array(iris_dataset.target) | ||
|
|
||
|
|
||
| def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray: | ||
| """ | ||
| Compute high-dimensional affinities (P matrix) using a Gaussian kernel. | ||
|
|
||
| Args: | ||
| data_matrix: Input data of shape (n_samples, n_features). | ||
| sigma: Gaussian kernel bandwidth. | ||
|
|
||
| Returns: | ||
| ndarray: Symmetrized probability matrix. | ||
|
|
||
| Example: | ||
| >>> x = np.array([[0.0, 0.0], [1.0, 0.0]]) | ||
| >>> probabilities = compute_pairwise_affinities(x) | ||
| >>> float(round(probabilities[0, 1], 3)) | ||
| 0.25 | ||
| """ | ||
| n_samples = data_matrix.shape[0] | ||
| squared_sum = np.sum(np.square(data_matrix), axis=1) | ||
| squared_distance = np.add( | ||
| np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum | ||
| ) | ||
|
|
||
| affinity_matrix = np.exp(-squared_distance / (2 * sigma**2)) | ||
| np.fill_diagonal(affinity_matrix, 0) | ||
|
|
||
| affinity_matrix /= np.sum(affinity_matrix) | ||
| return (affinity_matrix + affinity_matrix.T) / (2 * n_samples) | ||
|
|
||
|
|
||
| def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]: | ||
| """ | ||
| Compute low-dimensional affinities (Q matrix) using a Student-t distribution. | ||
|
|
||
| Args: | ||
| embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components). | ||
|
|
||
| Returns: | ||
| tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix). | ||
|
|
||
| Example: | ||
| >>> y = np.array([[0.0, 0.0], [1.0, 0.0]]) | ||
| >>> q_matrix, numerators = compute_low_dim_affinities(y) | ||
| >>> q_matrix.shape | ||
| (2, 2) | ||
| """ | ||
| squared_sum = np.sum(np.square(embedding_matrix), axis=1) | ||
| numerator_matrix = 1 / ( | ||
| 1 | ||
| + np.add( | ||
| np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T, | ||
| squared_sum, | ||
| ) | ||
| ) | ||
| np.fill_diagonal(numerator_matrix, 0) | ||
|
|
||
| q_matrix = numerator_matrix / np.sum(numerator_matrix) | ||
| return q_matrix, numerator_matrix | ||
|
|
||
|
|
||
| def apply_tsne( | ||
| data_matrix: ndarray, | ||
| n_components: int = 2, | ||
| learning_rate: float = 200.0, | ||
| n_iter: int = 500, | ||
| ) -> ndarray: | ||
| """ | ||
| Apply t-SNE for dimensionality reduction. | ||
|
|
||
| Args: | ||
| data_matrix: Original dataset (features). | ||
| n_components: Target dimension (2D or 3D). | ||
| learning_rate: Step size for gradient descent. | ||
| n_iter: Number of iterations. | ||
|
|
||
| Returns: | ||
| ndarray: Low-dimensional embedding of the data. | ||
|
|
||
| Example: | ||
| >>> features, _ = collect_dataset() | ||
| >>> embedding = apply_tsne(features, n_components=2, n_iter=50) | ||
| >>> embedding.shape | ||
| (150, 2) | ||
| """ | ||
| if n_components < 1 or n_iter < 1: | ||
| raise ValueError("n_components and n_iter must be >= 1") | ||
|
|
||
| n_samples = data_matrix.shape[0] | ||
| rng = np.random.default_rng() | ||
| embedding = rng.standard_normal((n_samples, n_components)) * 1e-4 | ||
|
|
||
| high_dim_affinities = compute_pairwise_affinities(data_matrix) | ||
| high_dim_affinities = np.maximum(high_dim_affinities, 1e-12) | ||
|
|
||
| embedding_increment = np.zeros_like(embedding) | ||
| momentum = 0.5 | ||
|
|
||
| for iteration in range(n_iter): | ||
| low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding) | ||
| low_dim_affinities = np.maximum(low_dim_affinities, 1e-12) | ||
|
|
||
| affinity_diff = high_dim_affinities - low_dim_affinities | ||
|
|
||
| gradient = 4 * ( | ||
| np.dot((affinity_diff * numerator_matrix), embedding) | ||
| - np.multiply( | ||
| np.sum(affinity_diff * numerator_matrix, axis=1)[:, np.newaxis], | ||
| embedding, | ||
| ) | ||
| ) | ||
|
|
||
| embedding_increment = momentum * embedding_increment - learning_rate * gradient | ||
| embedding += embedding_increment | ||
|
|
||
| if iteration == int(n_iter / 4): | ||
| momentum = 0.8 | ||
|
|
||
| return embedding | ||
|
|
||
|
|
||
| def main() -> None: | ||
| """ | ||
| Run t-SNE on the Iris dataset and display the first 5 embeddings. | ||
|
|
||
| Example: | ||
| >>> main() # doctest: +ELLIPSIS | ||
| t-SNE embedding (first 5 points): | ||
| [[... | ||
| """ | ||
| features, _labels = collect_dataset() | ||
| embedding = apply_tsne(features, n_components=2, n_iter=300) | ||
|
|
||
| if not isinstance(embedding, np.ndarray): | ||
| raise TypeError("t-SNE embedding must be an ndarray") | ||
|
|
||
| print("t-SNE embedding (first 5 points):") | ||
| print(embedding[:5]) | ||
|
|
||
| # Optional visualization (Ruff/mypy compliant) | ||
|
|
||
| # import matplotlib.pyplot as plt | ||
| # plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis") | ||
| # plt.title("t-SNE Visualization of the Iris Dataset") | ||
| # plt.xlabel("Dimension 1") | ||
| # plt.ylabel("Dimension 2") | ||
| # plt.show() | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| doctest.testmod() | ||
| main() | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.