diff --git a/pyproject.toml b/pyproject.toml index 5be781448..e7025cd35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,10 @@ scrapy = [ "scrapy>=2.11.0,<3", "twisted==22.10.0", ] +sharepoint = [ + "msal>=1.20.0", + "pandas>=2.0.0", +] [tool.uv] default-groups = [ @@ -113,6 +117,7 @@ default-groups = [ "airtable", "filesystem", "scrapy", + "sharepoint", ] # [tool.uv.sources] diff --git a/sources/sharepoint/README.md b/sources/sharepoint/README.md new file mode 100644 index 000000000..dc2e4afb0 --- /dev/null +++ b/sources/sharepoint/README.md @@ -0,0 +1,256 @@ +# SharePoint Source + +This source allows you to extract data from SharePoint lists and files using the Microsoft Graph API. + +## Features + +- Extract data from SharePoint lists +- Download and process files from SharePoint document libraries +- Support for multiple file formats (CSV, Excel, JSON, Parquet, SAS, SPSS) +- Incremental loading support for files based on modification time +- Flexible file filtering with regex patterns + +## Prerequisites + +Before using this source, you need: + +1. **Azure AD Application Registration** with the following: + - Client ID + - Tenant ID + - Client Secret + - Microsoft Graph API permissions: + - `Sites.Read.All` or `Sites.ReadWrite.All` + - `Files.Read.All` (for file operations) + +2. 
**SharePoint Site ID**: The unique identifier for your SharePoint site + +## Configuration + +### Credentials + +Configure your credentials in `secrets.toml`: + +```toml +[sources.sharepoint] +client_id = "your-client-id" +tenant_id = "your-tenant-id" +site_id = "your-site-id" +client_secret = "your-client-secret" +sub_site_id = "" # Optional: for sub-sites +``` + +### SharePoint List Configuration + +```python +from sharepoint.sharepoint_files_config import SharepointListConfig + +list_config = SharepointListConfig( + table_name="my_list_data", + list_title="My SharePoint List", + select="Title,Description,Status", # Optional: specific fields + is_incremental=False # Incremental not yet implemented +) +``` + +### SharePoint Files Configuration + +```python +from sharepoint.sharepoint_files_config import SharepointFilesConfig, FileType + +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents/Reports", + table_name="reports_data", + file_name_startswith="report_", + pattern=r".*\.csv$", # Optional: regex pattern for filtering + pandas_kwargs={"sep": ","}, # Optional: pandas read options + is_file_incremental=True # Enable incremental loading +) +``` + +## Usage Examples + +### Example 1: Load SharePoint List Data + +```python +import dlt +from sharepoint import sharepoint_list, SharepointCredentials +from sharepoint.sharepoint_files_config import SharepointListConfig + +# Configure credentials +credentials = SharepointCredentials() + +# Configure list extraction +list_config = SharepointListConfig( + table_name="tasks", + list_title="Project Tasks" +) + +# Create and run pipeline +pipeline = dlt.pipeline( + pipeline_name="sharepoint_list", + destination="duckdb", + dataset_name="sharepoint_data" +) + +load_info = pipeline.run( + sharepoint_list( + sharepoint_list_config=list_config, + credentials=credentials + ) +) +print(load_info) +``` + +### Example 2: Load Files from SharePoint + +```python +import dlt +from sharepoint import 
sharepoint_files, SharepointCredentials +from sharepoint.sharepoint_files_config import SharepointFilesConfig, FileType + +# Configure credentials +credentials = SharepointCredentials() + +# Configure file extraction +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Shared Documents/Reports", + table_name="monthly_reports", + file_name_startswith="report_", + pattern=r"202[4-5].*\.csv$", + is_file_incremental=True, + pandas_kwargs={"sep": ",", "encoding": "utf-8"} +) + +# Create and run pipeline +pipeline = dlt.pipeline( + pipeline_name="sharepoint_files", + destination="duckdb", + dataset_name="sharepoint_data" +) + +load_info = pipeline.run( + sharepoint_files( + sharepoint_files_config=files_config, + credentials=credentials + ) +) +print(load_info) +``` + +### Example 3: Process Excel Files with Chunking + +```python +files_config = SharepointFilesConfig( + file_type=FileType.EXCEL, + folder_path="Reports/Annual", + table_name="large_report", + file_name_startswith="annual_", + pandas_kwargs={ + "sheet_name": "Data", + "chunksize": 1000 # Process in chunks of 1000 rows + } +) +``` + +## Supported File Types + +The source supports the following file types via pandas: + +- `FileType.CSV` - CSV files +- `FileType.EXCEL` - Excel files (.xlsx, .xls) +- `FileType.JSON` - JSON files +- `FileType.PARQUET` - Parquet files +- `FileType.SAS` - SAS files +- `FileType.SPSS` - SPSS files + +## Incremental Loading + +### File Incremental Loading + +When `is_file_incremental=True`, the source tracks the `lastModifiedDateTime` of files and only processes files that have been modified since the last run. + +```python +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="data", + file_name_startswith="data_", + is_file_incremental=True # Only process new/modified files +) +``` + +### List Incremental Loading + +Incremental loading for SharePoint lists is not yet implemented. 
+ +## Advanced Configuration + +### Folder Path Validation + +Folder paths are automatically normalized: +- Leading/trailing slashes are removed +- Double slashes are not allowed +- Only alphanumeric characters, dashes, underscores, spaces, and dots are allowed + +### Pattern Matching + +The `pattern` parameter is automatically prefixed with `file_name_startswith`. For example: + +```python +files_config = SharepointFilesConfig( + file_name_startswith="report_", + pattern=r"\d{8}\.csv$" +) +# Effective pattern: ^report_\d{8}\.csv$ +``` + +### Pandas Kwargs + +Any pandas read function parameters can be passed via `pandas_kwargs`: + +```python +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="data", + file_name_startswith="data", + pandas_kwargs={ + "sep": ";", + "encoding": "latin1", + "decimal": ",", + "chunksize": 5000 + } +) +``` + +## Troubleshooting + +### Authentication Issues + +If you encounter authentication errors: +1. Verify your Client ID, Tenant ID, and Client Secret are correct +2. Ensure your Azure AD app has the required permissions +3. Check that admin consent has been granted for the permissions + +### File Not Found + +If files are not being found: +1. Verify the folder path is correct (case-sensitive) +2. Check that the file name pattern matches your files +3. Ensure your app has access to the SharePoint site and folder + +### Permission Errors + +Ensure your Azure AD application has been granted: +- `Sites.Read.All` or `Sites.ReadWrite.All` +- `Files.Read.All` + +And that admin consent has been provided for these permissions. 
+ +## Resources + +- [Microsoft Graph API Documentation](https://learn.microsoft.com/en-us/graph/api/overview) +- [SharePoint REST API](https://learn.microsoft.com/en-us/sharepoint/dev/sp-add-ins/get-to-know-the-sharepoint-rest-service) +- [Azure AD App Registration](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py new file mode 100644 index 000000000..b61d84fe2 --- /dev/null +++ b/sources/sharepoint/__init__.py @@ -0,0 +1,172 @@ +"""SharePoint data source for dlt. + +Provides sources for extracting data from SharePoint lists and files +using the Microsoft Graph API. +""" +from typing import Iterator, Dict +import re + +import dlt +from dlt.common.typing import TDataItems +from dlt.common.configuration.specs import configspec, BaseConfiguration +from dlt.common import logger +import pandas as pd + +from .helpers import SharepointClient +from .sharepoint_files_config import SharepointFilesConfig, SharepointListConfig + + +@configspec +class SharepointCredentials(BaseConfiguration): + """Credentials for SharePoint authentication via Azure AD. + + Attributes: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + """ + client_id: str = None + tenant_id: str = None + site_id: str = None + client_secret: str = None + sub_site_id: str = "" + + +@dlt.source(name="sharepoint_list", max_table_nesting=0) +def sharepoint_list( + sharepoint_list_config: SharepointListConfig, + credentials: SharepointCredentials = dlt.secrets.value, +) -> Iterator[Dict[str, str]]: + """Extract data from a SharePoint list. + + This source connects to SharePoint using Microsoft Graph API and retrieves + items from a specified list. 
+ + Args: + sharepoint_list_config: Configuration for the SharePoint list extraction + credentials: SharePoint authentication credentials + + Yields: + DLT resource containing SharePoint list items + + Example: + >>> config = SharepointListConfig( + ... table_name="tasks", + ... list_title="Project Tasks" + ... ) + >>> source = sharepoint_list(config) + >>> pipeline.run(source) + """ + client: SharepointClient = SharepointClient(**credentials) + client.connect() + logger.info(f"Connected to SharePoint site: {client.site_info}") + + def get_pipe(sharepoint_list_config: SharepointListConfig): + def get_records(sharepoint_list_config: SharepointListConfig): + data = client.get_items_from_list( + list_title=sharepoint_list_config.list_title, + select=sharepoint_list_config.select, + ) + yield from data + + return dlt.resource(get_records, name=sharepoint_list_config.table_name)( + sharepoint_list_config + ) + + yield get_pipe(sharepoint_list_config=sharepoint_list_config) + + +@dlt.source(name="sharepoint_files", max_table_nesting=0) +def sharepoint_files( + sharepoint_files_config: SharepointFilesConfig, + credentials: SharepointCredentials = dlt.secrets.value, +): + """Extract and process files from SharePoint document libraries. + + This source downloads files from SharePoint based on the configuration, + processes them using pandas, and yields the data for loading. + + Supports incremental loading based on file modification time. + + Args: + sharepoint_files_config: Configuration for file extraction and processing + credentials: SharePoint authentication credentials + + Yields: + DLT resource containing processed file data + + Example: + >>> config = SharepointFilesConfig( + ... file_type=FileType.CSV, + ... folder_path="Documents/Reports", + ... table_name="reports", + ... file_name_startswith="report_" + ... 
) + >>> source = sharepoint_files(config) + >>> pipeline.run(source) + """ + client: SharepointClient = SharepointClient(**credentials) + client.connect() + logger.info(f"Connected to SharePoint site: {client.site_info}") + + def get_files( + sharepoint_files_config: SharepointFilesConfig, + last_update_timestamp: dlt.sources.incremental = dlt.sources.incremental( + cursor_path="lastModifiedDateTime", + initial_value="2020-01-01T00:00:00Z", + primary_key=(), + ), + ): + current_last_value = last_update_timestamp.last_value + logger.debug(f"current_last_value: {current_last_value}") + for file_item in client.get_files_from_path( + folder_path=sharepoint_files_config.folder_path, + file_name_startswith=sharepoint_files_config.file_name_startswith, + pattern=sharepoint_files_config.pattern, + ): + logger.debug( + "filtering files based on lastModifiedDateTime, compare to last_value:" + f" {current_last_value}" + ) + if ( + file_item["lastModifiedDateTime"] > current_last_value + or not sharepoint_files_config.is_file_incremental + ): + logger.info( + f"Processing file after lastModifiedDateTime filter: {file_item['name']}" + ) + + file_item["pd_function"] = ( + sharepoint_files_config.file_type.get_pd_function() + ) + file_item["pd_kwargs"] = sharepoint_files_config.pandas_kwargs + yield file_item + else: + logger.info( + f"Skipping file {file_item['name']} based on lastModifiedDateTime filter" + ) + + def get_records(file_item: Dict) -> TDataItems: + chunksize = file_item["pd_kwargs"].get("chunksize", None) + file_io = client.get_file_bytes_io(file_item=file_item) + + if chunksize: + with file_item["pd_function"](file_io, **file_item["pd_kwargs"]) as reader: + for num, chunk in enumerate(reader): + logger.info(f"Processing chunk {num} of {file_item['name']}") + yield chunk + else: + df = file_item["pd_function"](file_io, **file_item["pd_kwargs"]) + yield df + logger.debug(f"get_records done for {file_item['name']}") + + def get_pipe(sharepoint_files_config: 
SharepointFilesConfig): + return dlt.resource( + get_files, name=f"{sharepoint_files_config.table_name}_files" + )(sharepoint_files_config) | dlt.transformer( + get_records, name=sharepoint_files_config.table_name, parallelized=False + ) + + yield get_pipe(sharepoint_files_config=sharepoint_files_config) diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py new file mode 100644 index 000000000..43aed4ff4 --- /dev/null +++ b/sources/sharepoint/helpers.py @@ -0,0 +1,265 @@ +"""Helper module for SharePoint data extraction using Microsoft Graph API.""" +from typing import Dict, List +from io import BytesIO +import re + +from msal import ConfidentialClientApplication +from dlt.common import logger +from dlt.sources.helpers.rest_client import RESTClient +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth +from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator + + +class SharepointClient: + """Client for interacting with SharePoint via Microsoft Graph API. + + This client handles authentication and provides methods to retrieve lists, + list items, and files from SharePoint sites. + + Attributes: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + graph_api_url: Base URL for Microsoft Graph API + graph_site_url: Full URL for the specific SharePoint site + client: REST client instance (set after connect()) + """ + + def __init__( + self, + client_id: str, + tenant_id: str, + site_id: str, + client_secret: str, + sub_site_id: str = "", + ) -> None: + """Initialize SharePoint client with credentials. 
+ + Args: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + + Raises: + ValueError: If any required credentials are missing + """ + self.client_id = client_id + self.tenant_id = tenant_id + self.client_secret = client_secret + self.sub_site_id = sub_site_id + self.site_id = site_id + if not all([self.client_id, self.tenant_id, self.client_secret, self.site_id]): + raise ValueError( + "client_id, tenant_id, client_secret and site_id are required for connect to" + " SharePoint" + ) + self.graph_api_url = "https://graph.microsoft.com/v1.0/sites" + self.graph_site_url = f"{self.graph_api_url}/{self.site_id}" + if self.sub_site_id: + self.graph_site_url += f"/sites/{self.sub_site_id}" + + def connect(self) -> None: + """Establish connection to SharePoint using MSAL authentication. + + Acquires an access token using client credentials flow and initializes + the REST client with bearer token authentication. 
+ + Raises: + ConnectionError: If authentication fails or access token cannot be obtained + """ + authority = f"https://login.microsoftonline.com/{self.tenant_id}" + scope = ["https://graph.microsoft.com/.default"] + + app = ConfidentialClientApplication( + self.client_id, + authority=authority, + client_credential=self.client_secret, + ) + + # Get the access token + token_response = app.acquire_token_for_client(scopes=scope) + access_token = token_response.get("access_token", None) + + if access_token: + self.client = RESTClient( + base_url=self.graph_site_url, + auth=BearerTokenAuth(access_token), + paginator=JSONLinkPaginator(next_url_path="@odata.nextLink"), + ) + logger.info(f"Connected to SharePoint site id: {self.site_id} successfully") + else: + # Single formatted message: passing two args to ConnectionError makes + # str(exc) render as a tuple repr; callers matching "Connection failed" + # (e.g. the test suite) still match this message. + raise ConnectionError(f"Connection failed: {token_response}") + + @property + def sub_sites(self) -> List: + """Get list of sub-sites within the current SharePoint site. + + Returns: + List of sub-site information dictionaries + """ + url = f"{self.graph_site_url}/sites" + response = self.client.get(url) + site_info = response.json() + if "value" in site_info: + return site_info["value"] + else: + logger.warning(f"No subsite found in {url}") + + @property + def site_info(self) -> Dict: + """Get information about the current SharePoint site. + + Returns: + Dictionary containing site metadata and properties + """ + url = f"{self.graph_site_url}" + response = self.client.get(url) + site_info = response.json() + if "error" not in site_info: + return site_info + else: + logger.warning(f"No site_info found in {url}") + + def get_all_lists_in_site(self) -> List[Dict]: + """Retrieve all generic lists from the SharePoint site. + + Filters for lists with template type 'genericList' and 'Lists' in their URL, + excluding document libraries and other non-list items. 
+ + Returns: + List of dictionaries containing list metadata + """ + url = f"{self.graph_site_url}/lists" + res = self.client.get(url) + res.raise_for_status() + lists_info = res.json() + if "value" in lists_info: + all_items = lists_info["value"] + filtered_lists = [ + item + for item in all_items + if item.get("list", {}).get("template") == "genericList" + and "Lists" in item.get("webUrl", "") + ] + return filtered_lists + else: + filtered_lists = [] + if not filtered_lists: + logger.warning(f"No lists found in {url}") + return filtered_lists + + def get_items_from_list(self, list_title: str, select: str = None) -> List[Dict]: + """Retrieve items from a specific SharePoint list. + + Note: Pagination is not yet implemented; only the first page is returned. + + Args: + list_title: Display name of the SharePoint list + select: Optional comma-separated string of field names to retrieve + + Returns: + List of dictionaries containing list item field values + + Raises: + ValueError: If the specified list is not found in the site + """ + # TODO, pagination not yet implemented + logger.warning( + "Pagination is not implemented for get_items_from_list, " + "it will return only first page of items." + ) + all_lists = self.get_all_lists_in_site() + filtered_lists = [ + x + for x in all_lists + if x.get("list", {}).get("template") == "genericList" + and "Lists" in x.get("webUrl", "") + ] + + possible_list_titles = [x["displayName"] for x in filtered_lists] + if list_title not in possible_list_titles: + raise ValueError( + f"List with title '{list_title}' not found in site {self.site_id}. 
" + f"Available lists: {possible_list_titles}" + ) + + # Get the list ID + list_id = next( + x["id"] for x in filtered_lists if x["displayName"] == list_title + ) + + url = f"{self.graph_site_url}/lists/{list_id}/items?expand=fields" + if select: + url += f"(select={select})" + res = self.client.get(url) + res.raise_for_status() + items_info = res.json() + + if "value" in items_info: + output = [x.get("fields", {}) for x in items_info["value"]] + else: + output = [] + if output: + logger.info(f"Got {len(output)} items from list: {list_title}") + return output + else: + logger.warning( + f"No items found in list: {list_title}, with select: {select}" + ) + + def get_files_from_path( + self, folder_path: str, file_name_startswith: str, pattern: str = None + ) -> Dict: + """Get files from a SharePoint folder matching specified criteria. + + Args: + folder_path: Path to the folder within SharePoint (e.g., 'Documents/Reports') + file_name_startswith: Prefix that file names must start with + pattern: Optional regex pattern for additional filtering + + Returns: + List of file item dictionaries containing metadata and download URLs + """ + folder_url = ( + f"{self.graph_site_url}/drive/root:/{folder_path}:/children?$filter=startswith(name," + f" '{file_name_startswith}')" + ) + logger.debug(f"Getting files from folder with endpoint: {folder_url}") + res = self.client.get(folder_url) + file_and_folder_items = res.json().get("value", []) + file_items = [x for x in file_and_folder_items if "file" in x.keys()] + if pattern: + logger.debug(f"Filtering files with pattern: {pattern}") + file_items = [x for x in file_items if re.search(pattern, x["name"])] + + logger.debug(f"Got number files from ms graph api: {len(file_items)}") + return file_items + + def get_file_bytes_io(self, file_item: Dict): + """Download a SharePoint file to a BytesIO object. 
+ + Args: + file_item: File metadata dictionary containing '@microsoft.graph.downloadUrl' + + Returns: + BytesIO object containing the file contents + + Raises: + FileNotFoundError: If the file cannot be downloaded + """ + file_url = file_item["@microsoft.graph.downloadUrl"] + response = self.client.get(file_url) + if response.status_code == 200: + bytes_io = BytesIO(response.content) + logger.info( + f"File {file_item['name']} downloaded to BytesIO, size: {len(bytes_io.getvalue())}" + ) + return bytes_io + else: + raise FileNotFoundError( + f"File not found: {file_item['name']} or can't be downloaded" + ) diff --git a/sources/sharepoint/requirements.txt b/sources/sharepoint/requirements.txt new file mode 100644 index 000000000..e7fbee22e --- /dev/null +++ b/sources/sharepoint/requirements.txt @@ -0,0 +1,3 @@ +msal>=1.20.0 +pandas>=2.0.0 +dlt>=0.5.1 diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py new file mode 100644 index 000000000..45ef8934c --- /dev/null +++ b/sources/sharepoint/sharepoint_files_config.py @@ -0,0 +1,133 @@ +"""Configuration classes for SharePoint data extraction. + +Provides configuration models for SharePoint lists and files, +including file type definitions and validation utilities. +""" +from typing import Optional, Dict +import re +from enum import Enum + +import pandas as pd +from pydantic import BaseModel + + +class FileType(Enum): + """Supported file types for SharePoint file extraction. + + Each file type maps to a corresponding pandas read function. + + Attributes: + EXCEL: Excel files (.xlsx, .xls) + CSV: Comma-separated values files + JSON: JSON format files + PARQUET: Apache Parquet files + SAS: SAS data files + SPSS: SPSS data files + SAV: SPSS SAV format files + """ + EXCEL = "excel" + CSV = "csv" + JSON = "json" + PARQUET = "parquet" + SAS = "sas" + SPSS = "spss" + SAV = "sav" + + def get_pd_function(self): + """Get the pandas read function for this file type. 
+ + Returns: + Callable pandas read function (e.g., pd.read_csv, pd.read_excel) + """ + return { + self.EXCEL: pd.read_excel, + self.CSV: pd.read_csv, + self.JSON: pd.read_json, + self.PARQUET: pd.read_parquet, + self.SAS: pd.read_sas, + self.SPSS: pd.read_spss, + # SAV is the SPSS on-disk (.sav) format; without this entry the + # declared FileType.SAV member would raise KeyError here. + self.SAV: pd.read_spss, + }[self] + + +class SharepointListConfig(BaseModel): + """Configuration for SharePoint list data extraction. + + Attributes: + table_name: Name of the destination table for the list data + list_title: Display name of the SharePoint list to extract + select: Optional comma-separated field names to retrieve + is_incremental: Enable incremental loading (not yet implemented) + + Raises: + NotImplementedError: If is_incremental is set to True + """ + table_name: str + list_title: str + select: Optional[str] = None + is_incremental: Optional[bool] = False + + def __init__(self, **data): + super().__init__(**data) + if self.is_incremental is True: + raise NotImplementedError( + "Incremental loading for Sharepoint List is not implemented yet." + ) + + +class SharepointFilesConfig(BaseModel): + """Configuration for SharePoint file extraction and processing. + + Attributes: + file_type: Type of files to process (CSV, Excel, etc.) + folder_path: Path to the SharePoint folder containing files + table_name: Name of the destination table for file data + file_name_startswith: Prefix filter for file names + pattern: Optional regex pattern for additional file filtering + pandas_kwargs: Additional arguments to pass to pandas read function + is_file_incremental: Enable incremental loading based on file modification time + + Note: + The pattern attribute is automatically prefixed with file_name_startswith. + Folder paths are validated and normalized during initialization. 
+ """ + file_type: FileType + folder_path: str + table_name: str + file_name_startswith: str + pattern: Optional[str] = ".*" + pandas_kwargs: Dict = {} + is_file_incremental: bool = False + + def __init__(self, **data): + super().__init__(**data) + self.folder_path = validate_folder_path(self.folder_path) + self.pattern = f"^{self.file_name_startswith}{self.pattern}" + + +def validate_folder_path(folder_path: str) -> str: + """Validate and normalize a SharePoint folder path. + + Removes leading/trailing slashes and validates that the path contains + only allowed characters (alphanumeric, dashes, underscores, spaces, dots). + + Args: + folder_path: The folder path to validate + + Returns: + Normalized folder path without leading/trailing slashes + + Raises: + ValueError: If path contains invalid characters or double slashes + """ + if folder_path.startswith("/"): + folder_path = folder_path[1:] + if folder_path.endswith("/"): + folder_path = folder_path[:-1] + if not re.compile(r"^[a-zA-Z0-9_\-/\s\.]*$").match(folder_path): + raise ValueError( + "Invalid folder path, only alphanumeric characters, dashes and underscores are" + f" allowed: {folder_path}" + ) + if re.compile(r"//").search(folder_path): + raise ValueError(f"Invalid folder path with double slashes: {folder_path}") + return folder_path diff --git a/sources/sharepoint_pipeline.py b/sources/sharepoint_pipeline.py new file mode 100644 index 000000000..0e63653ff --- /dev/null +++ b/sources/sharepoint_pipeline.py @@ -0,0 +1,61 @@ + +import dlt +from sharepoint import sharepoint_list, sharepoint_files, SharepointCredentials +from sharepoint.sharepoint_files_config import SharepointFilesConfig, SharepointListConfig + +if __name__ == "__main__": + # --- 1. Define SharePoint credentials --- + credentials = SharepointCredentials( + client_id="your-client-id", + tenant_id="your-tenant-id", + site_id="your-site-id", + client_secret="your-client-secret", + sub_site_id="" + ) + + # --- 2. 
Configure SharePoint list extraction --- + list_config = SharepointListConfig( + list_title="test_list", + select="Title,ins", + table_name="sharepoint_list_table" + ) + + # --- 3. Configure SharePoint file extraction --- + files_config = SharepointFilesConfig( + folder_path="General/sharepoint_test", + file_name_startswith="test_", + pattern=r".*\.csv$", + file_type="csv", + table_name="sharepoint_reports", + is_file_incremental=True, + pandas_kwargs={} + ) + + # --- 4. Create the DLT pipeline (destination = DuckDB) --- + pipeline = dlt.pipeline( + pipeline_name="sharepoint_to_duckdb", + destination="duckdb", + dataset_name="sharepoint_data", + full_refresh=False + ) + + # --- 5. Run both sources and load to DuckDB --- + print("Loading SharePoint List data...") + list_load_info = pipeline.run( + sharepoint_list(sharepoint_list_config=list_config, credentials=credentials) + ) + print(list_load_info) + with pipeline.sql_client() as client: + df = client.execute("SELECT * FROM sharepoint_list_table LIMIT 10").df() + print(df) + + + print("Loading SharePoint Files data...") + files_load_info = pipeline.run( + sharepoint_files(sharepoint_files_config=files_config, credentials=credentials) + ) + print(files_load_info) + + with pipeline.sql_client() as client: + df = client.execute("SELECT * FROM sharepoint_reports LIMIT 10").df() + print(df) diff --git a/tests/sharepoint/__init__.py b/tests/sharepoint/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/sharepoint/test_sharepoint_source.py b/tests/sharepoint/test_sharepoint_source.py new file mode 100644 index 000000000..492bdd456 --- /dev/null +++ b/tests/sharepoint/test_sharepoint_source.py @@ -0,0 +1,654 @@ +import pytest +from unittest.mock import Mock, patch, MagicMock +from io import BytesIO +import pandas as pd +from typing import Dict, List + +import dlt +from sources.sharepoint import ( + sharepoint_list, + sharepoint_files, + SharepointCredentials, +) +from 
"""Tests for the SharePoint source.

Covers: file/list configuration validation, folder-path validation, the
Microsoft Graph ``SharepointClient`` helper (with MSAL and REST calls mocked),
the ``sharepoint_list``/``sharepoint_files`` dlt sources, and end-to-end
pipeline runs against all configured destinations.
"""

from io import BytesIO
from unittest.mock import Mock, patch

import dlt
import pandas as pd
import pytest

from sources.sharepoint import (
    SharepointCredentials,
    sharepoint_files,
    sharepoint_list,
)
from sources.sharepoint.sharepoint_files_config import (
    SharepointFilesConfig,
    SharepointListConfig,
    FileType,
    validate_folder_path,
)
from sources.sharepoint.helpers import SharepointClient

from tests.utils import (
    ALL_DESTINATIONS,
    assert_load_info,
    load_table_counts,
)


# Mock credentials for testing
MOCK_CREDENTIALS = {
    "client_id": "test_client_id",
    "tenant_id": "test_tenant_id",
    "site_id": "test_site_id",
    "client_secret": "test_client_secret",
    "sub_site_id": "",
}


class TestSharepointFilesConfig:
    """Test SharepointFilesConfig class"""

    def test_valid_config(self):
        """Test creating a valid SharepointFilesConfig"""
        config = SharepointFilesConfig(
            file_type=FileType.CSV,
            folder_path="/Documents/Reports",
            table_name="test_table",
            file_name_startswith="report",
            pattern=r".*\.csv$",
            pandas_kwargs={"sep": ","},
            is_file_incremental=True,
        )
        assert config.file_type == FileType.CSV
        # Leading slash is stripped by validation.
        assert config.folder_path == "Documents/Reports"
        assert config.table_name == "test_table"
        assert config.file_name_startswith == "report"
        # Pattern is anchored and prefixed with file_name_startswith.
        assert config.pattern == r"^report.*\.csv$"
        assert config.pandas_kwargs == {"sep": ","}
        assert config.is_file_incremental is True

    def test_folder_path_normalization(self):
        """Test that folder paths are normalized correctly"""
        config = SharepointFilesConfig(
            file_type=FileType.EXCEL,
            folder_path="/Documents/",
            table_name="test_table",
            file_name_startswith="file",
        )
        assert config.folder_path == "Documents"

    def test_pattern_prefix(self):
        """Test that pattern is prefixed with file_name_startswith"""
        config = SharepointFilesConfig(
            file_type=FileType.CSV,
            folder_path="Documents",
            table_name="test_table",
            file_name_startswith="report_",
            pattern=r"\d{8}\.csv$",
        )
        assert config.pattern == r"^report_\d{8}\.csv$"

    def test_get_pd_function(self):
        """Test that get_pd_function returns correct pandas functions"""
        assert FileType.CSV.get_pd_function() == pd.read_csv
        assert FileType.EXCEL.get_pd_function() == pd.read_excel
        assert FileType.JSON.get_pd_function() == pd.read_json
        assert FileType.PARQUET.get_pd_function() == pd.read_parquet


class TestSharepointListConfig:
    """Test SharepointListConfig class"""

    def test_valid_config(self):
        """Test creating a valid SharepointListConfig"""
        config = SharepointListConfig(
            table_name="test_table",
            list_title="Test List",
            select="field1,field2",
            is_incremental=False,
        )
        assert config.table_name == "test_table"
        assert config.list_title == "Test List"
        assert config.select == "field1,field2"
        assert config.is_incremental is False

    def test_incremental_not_implemented(self):
        """Test that incremental loading raises NotImplementedError"""
        with pytest.raises(NotImplementedError):
            SharepointListConfig(
                table_name="test_table",
                list_title="Test List",
                is_incremental=True,
            )


class TestValidateFolderPath:
    """Test validate_folder_path function"""

    def test_remove_leading_slash(self):
        """Test that leading slashes are removed"""
        assert validate_folder_path("/Documents") == "Documents"

    def test_remove_trailing_slash(self):
        """Test that trailing slashes are removed"""
        assert validate_folder_path("Documents/") == "Documents"

    def test_remove_both_slashes(self):
        """Test that both leading and trailing slashes are removed"""
        assert validate_folder_path("/Documents/") == "Documents"

    def test_valid_path_with_subdirs(self):
        """Test valid path with subdirectories"""
        assert validate_folder_path("Documents/Reports/2024") == "Documents/Reports/2024"

    def test_valid_path_with_spaces(self):
        """Test valid path with spaces"""
        assert validate_folder_path("My Documents/My Reports") == "My Documents/My Reports"

    def test_invalid_characters(self):
        """Test that invalid characters raise ValueError"""
        with pytest.raises(ValueError, match="Invalid folder path"):
            validate_folder_path("Documents/Reports@2024")

    def test_double_slashes(self):
        """Test that double slashes raise ValueError"""
        with pytest.raises(ValueError, match="Invalid folder path with double slashes"):
            validate_folder_path("Documents//Reports")


class TestSharepointClient:
    """Test SharepointClient class"""

    def test_client_initialization(self):
        """Test SharepointClient initialization"""
        client = SharepointClient(**MOCK_CREDENTIALS)
        assert client.client_id == MOCK_CREDENTIALS["client_id"]
        assert client.tenant_id == MOCK_CREDENTIALS["tenant_id"]
        assert client.site_id == MOCK_CREDENTIALS["site_id"]
        assert client.client_secret == MOCK_CREDENTIALS["client_secret"]
        assert client.sub_site_id == ""
        assert client.graph_site_url == f"https://graph.microsoft.com/v1.0/sites/{MOCK_CREDENTIALS['site_id']}"

    def test_client_with_subsite(self):
        """Test SharepointClient initialization with sub_site_id"""
        credentials = MOCK_CREDENTIALS.copy()
        credentials["sub_site_id"] = "sub_site_123"
        client = SharepointClient(**credentials)
        expected_url = f"https://graph.microsoft.com/v1.0/sites/{MOCK_CREDENTIALS['site_id']}/sites/sub_site_123"
        assert client.graph_site_url == expected_url

    def test_client_missing_credentials(self):
        """Test that missing credentials raise ValueError"""
        with pytest.raises(ValueError, match="client_id, tenant_id, client_secret and site_id are required"):
            SharepointClient(
                client_id="",
                tenant_id="test",
                site_id="test",
                client_secret="test",
            )

    @patch("sources.sharepoint.helpers.ConfidentialClientApplication")
    @patch("sources.sharepoint.helpers.RESTClient")
    def test_connect_success(self, mock_rest_client, mock_msal_app):
        """Test successful connection to SharePoint"""
        # Mock MSAL token response
        mock_app_instance = Mock()
        mock_app_instance.acquire_token_for_client.return_value = {
            "access_token": "test_access_token"
        }
        mock_msal_app.return_value = mock_app_instance

        # Mock REST client
        mock_client_instance = Mock()
        mock_rest_client.return_value = mock_client_instance

        client = SharepointClient(**MOCK_CREDENTIALS)
        client.connect()

        assert client.client is not None
        mock_msal_app.assert_called_once()
        mock_rest_client.assert_called_once()

    @patch("sources.sharepoint.helpers.ConfidentialClientApplication")
    def test_connect_failure(self, mock_msal_app):
        """Test failed connection to SharePoint"""
        # Mock MSAL token response without access_token
        mock_app_instance = Mock()
        mock_app_instance.acquire_token_for_client.return_value = {"error": "authentication_failed"}
        mock_msal_app.return_value = mock_app_instance

        client = SharepointClient(**MOCK_CREDENTIALS)
        with pytest.raises(ConnectionError, match="Connection failed"):
            client.connect()

    @patch("sources.sharepoint.helpers.ConfidentialClientApplication")
    @patch("sources.sharepoint.helpers.RESTClient")
    def test_get_all_lists_in_site(self, mock_rest_client, mock_msal_app):
        """Test getting all lists from a site"""
        # Setup mocks
        mock_app_instance = Mock()
        mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"}
        mock_msal_app.return_value = mock_app_instance

        mock_client_instance = Mock()
        mock_response = Mock()
        mock_response.json.return_value = {
            "value": [
                {
                    "id": "list1",
                    "displayName": "Test List 1",
                    "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList1",
                    "list": {"template": "genericList"},
                },
                {
                    "id": "list2",
                    "displayName": "Test List 2",
                    "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList2",
                    "list": {"template": "genericList"},
                },
                {
                    "id": "list3",
                    "displayName": "Document Library",
                    "webUrl": "https://test.sharepoint.com/sites/test/Shared Documents",
                    "list": {"template": "documentLibrary"},
                },
            ]
        }
        mock_response.raise_for_status = Mock()
        mock_client_instance.get.return_value = mock_response
        mock_rest_client.return_value = mock_client_instance

        client = SharepointClient(**MOCK_CREDENTIALS)
        client.connect()
        lists = client.get_all_lists_in_site()

        # Document libraries are filtered out: only genericList templates remain.
        assert len(lists) == 2
        assert all(item["list"]["template"] == "genericList" for item in lists)
        assert all("Lists" in item["webUrl"] for item in lists)

    @patch("sources.sharepoint.helpers.ConfidentialClientApplication")
    @patch("sources.sharepoint.helpers.RESTClient")
    def test_get_items_from_list(self, mock_rest_client, mock_msal_app):
        """Test getting items from a specific list"""
        # Setup mocks
        mock_app_instance = Mock()
        mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"}
        mock_msal_app.return_value = mock_app_instance

        mock_client_instance = Mock()

        # Mock response for get_all_lists_in_site
        mock_lists_response = Mock()
        mock_lists_response.json.return_value = {
            "value": [
                {
                    "id": "list1",
                    "displayName": "Test List",
                    "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList",
                    "list": {"template": "genericList"},
                }
            ]
        }
        mock_lists_response.raise_for_status = Mock()

        # Mock response for list items
        mock_items_response = Mock()
        mock_items_response.json.return_value = {
            "value": [
                {"fields": {"Title": "Item 1", "Description": "Test item 1"}},
                {"fields": {"Title": "Item 2", "Description": "Test item 2"}},
            ]
        }
        mock_items_response.raise_for_status = Mock()

        # First GET resolves the list id, second GET fetches its items.
        mock_client_instance.get.side_effect = [mock_lists_response, mock_items_response]
        mock_rest_client.return_value = mock_client_instance

        client = SharepointClient(**MOCK_CREDENTIALS)
        client.connect()
        items = client.get_items_from_list("Test List")

        assert len(items) == 2
        assert items[0]["Title"] == "Item 1"
        assert items[1]["Title"] == "Item 2"

    @patch("sources.sharepoint.helpers.ConfidentialClientApplication")
    @patch("sources.sharepoint.helpers.RESTClient")
    def test_get_items_from_nonexistent_list(self, mock_rest_client, mock_msal_app):
        """Test getting items from a list that doesn't exist"""
        # Setup mocks
        mock_app_instance = Mock()
        mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"}
        mock_msal_app.return_value = mock_app_instance

        mock_client_instance = Mock()
        mock_response = Mock()
        mock_response.json.return_value = {
            "value": [
                {
                    "id": "list1",
                    "displayName": "Test List",
                    "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList",
                    "list": {"template": "genericList"},
                }
            ]
        }
        mock_response.raise_for_status = Mock()
        mock_client_instance.get.return_value = mock_response
        mock_rest_client.return_value = mock_client_instance

        client = SharepointClient(**MOCK_CREDENTIALS)
        client.connect()

        with pytest.raises(ValueError, match="List with title 'Nonexistent List' not found"):
            client.get_items_from_list("Nonexistent List")

    @patch("sources.sharepoint.helpers.ConfidentialClientApplication")
    @patch("sources.sharepoint.helpers.RESTClient")
    def test_get_files_from_path(self, mock_rest_client, mock_msal_app):
        """Test getting files from a folder path"""
        # Setup mocks
        mock_app_instance = Mock()
        mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"}
        mock_msal_app.return_value = mock_app_instance

        mock_client_instance = Mock()
        mock_response = Mock()
        mock_response.json.return_value = {
            "value": [
                {
                    "name": "report_2024.csv",
                    "file": {},
                    "lastModifiedDateTime": "2024-01-01T00:00:00Z",
                },
                {
                    "name": "report_2023.csv",
                    "file": {},
                    "lastModifiedDateTime": "2023-01-01T00:00:00Z",
                },
                {
                    "name": "subfolder",
                    "folder": {},
                },
            ]
        }
        mock_client_instance.get.return_value = mock_response
        mock_rest_client.return_value = mock_client_instance

        client = SharepointClient(**MOCK_CREDENTIALS)
        client.connect()
        files = client.get_files_from_path("Documents", "report", pattern=r".*2024.*")

        # Folders are skipped and the regex filters out the 2023 file.
        assert len(files) == 1
        assert files[0]["name"] == "report_2024.csv"

    @patch("sources.sharepoint.helpers.ConfidentialClientApplication")
    @patch("sources.sharepoint.helpers.RESTClient")
    def test_get_file_bytes_io(self, mock_rest_client, mock_msal_app):
        """Test downloading a file to BytesIO"""
        # Setup mocks
        mock_app_instance = Mock()
        mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"}
        mock_msal_app.return_value = mock_app_instance

        mock_client_instance = Mock()
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.content = b"test file content"
        mock_client_instance.get.return_value = mock_response
        mock_rest_client.return_value = mock_client_instance

        client = SharepointClient(**MOCK_CREDENTIALS)
        client.connect()

        file_item = {
            "name": "test.csv",
            "@microsoft.graph.downloadUrl": "https://test.sharepoint.com/download/test.csv",
        }
        bytes_io = client.get_file_bytes_io(file_item)

        assert isinstance(bytes_io, BytesIO)
        assert bytes_io.getvalue() == b"test file content"


class TestSharepointListSource:
    """Test sharepoint_list source"""

    @patch("sources.sharepoint.SharepointClient")
    def test_sharepoint_list_source(self, mock_client_class):
        """Test sharepoint_list source yields data correctly"""
        # Setup mock client
        mock_client_instance = Mock()
        mock_client_instance.site_info = {"id": "test_site"}
        mock_client_instance.get_items_from_list.return_value = [
            {"Title": "Item 1", "Field1": "Value1"},
            {"Title": "Item 2", "Field1": "Value2"},
        ]
        mock_client_class.return_value = mock_client_instance

        # Create config
        config = SharepointListConfig(
            table_name="test_table",
            list_title="Test List",
            select="Title,Field1",
        )

        # Create source
        credentials = SharepointCredentials(**MOCK_CREDENTIALS)
        source = sharepoint_list(config, credentials=credentials)

        # Extract resources from source
        resources = list(source.resources.values())
        assert len(resources) == 1

        # Get data from resource
        resource_data = list(resources[0])
        assert len(resource_data) == 2
        assert resource_data[0]["Title"] == "Item 1"
        assert resource_data[1]["Title"] == "Item 2"


class TestSharepointFilesSource:
    """Test sharepoint_files source"""

    @patch("sources.sharepoint.SharepointClient")
    def test_sharepoint_files_source_csv(self, mock_client_class):
        """Test sharepoint_files source with CSV files"""
        # Setup mock client
        mock_client_instance = Mock()
        mock_client_instance.site_info = {"id": "test_site"}
        mock_client_instance.get_files_from_path.return_value = [
            {
                "name": "report.csv",
                "lastModifiedDateTime": "2024-01-01T00:00:00Z",
                "@microsoft.graph.downloadUrl": "https://test.com/report.csv",
            }
        ]

        # Create test CSV data
        csv_data = b"col1,col2\nval1,val2\nval3,val4"
        mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data)
        mock_client_class.return_value = mock_client_instance

        # Create config
        config = SharepointFilesConfig(
            file_type=FileType.CSV,
            folder_path="Documents",
            table_name="test_table",
            file_name_startswith="report",
        )

        # Create source
        credentials = SharepointCredentials(**MOCK_CREDENTIALS)
        source = sharepoint_files(config, credentials=credentials)

        # Extract resources from source
        resources = list(source.resources.values())
        assert len(resources) == 1

        # Get data from resource - this should yield dataframes
        all_data = list(resources[0])

        # The transformer should yield dataframes
        assert len(all_data) > 0

    @patch("sources.sharepoint.SharepointClient")
    def test_sharepoint_files_source_incremental(self, mock_client_class):
        """Test sharepoint_files source with incremental loading"""
        # Setup mock client
        mock_client_instance = Mock()
        mock_client_instance.site_info = {"id": "test_site"}
        mock_client_instance.get_files_from_path.return_value = [
            {
                "name": "old_file.csv",
                "lastModifiedDateTime": "2020-01-01T00:00:00Z",
                "@microsoft.graph.downloadUrl": "https://test.com/old_file.csv",
            },
            {
                "name": "new_file.csv",
                "lastModifiedDateTime": "2024-01-01T00:00:00Z",
                "@microsoft.graph.downloadUrl": "https://test.com/new_file.csv",
            },
        ]

        csv_data = b"col1,col2\nval1,val2"
        # Two files are mocked: return a fresh stream per call, because a single
        # shared BytesIO would already be exhausted when the second file is read.
        mock_client_instance.get_file_bytes_io.side_effect = (
            lambda *args, **kwargs: BytesIO(csv_data)
        )
        mock_client_class.return_value = mock_client_instance

        # Create config with incremental loading
        config = SharepointFilesConfig(
            file_type=FileType.CSV,
            folder_path="Documents",
            table_name="test_table",
            file_name_startswith="file",
            is_file_incremental=True,
        )

        # Create source
        credentials = SharepointCredentials(**MOCK_CREDENTIALS)
        source = sharepoint_files(config, credentials=credentials)

        # Extract resources from source
        resources = list(source.resources.values())
        assert len(resources) == 1

    @patch("sources.sharepoint.SharepointClient")
    def test_sharepoint_files_source_with_chunks(self, mock_client_class):
        """Test sharepoint_files source with chunked reading"""
        # Setup mock client
        mock_client_instance = Mock()
        mock_client_instance.site_info = {"id": "test_site"}
        mock_client_instance.get_files_from_path.return_value = [
            {
                "name": "large_file.csv",
                "lastModifiedDateTime": "2024-01-01T00:00:00Z",
                "@microsoft.graph.downloadUrl": "https://test.com/large_file.csv",
            }
        ]

        # Create larger CSV data
        csv_data = b"col1,col2\n" + b"\n".join([b"val1,val2" for _ in range(100)])
        mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data)
        mock_client_class.return_value = mock_client_instance

        # Create config with chunksize
        config = SharepointFilesConfig(
            file_type=FileType.CSV,
            folder_path="Documents",
            table_name="test_table",
            file_name_startswith="large",
            pandas_kwargs={"chunksize": 10},
        )

        # Create source
        credentials = SharepointCredentials(**MOCK_CREDENTIALS)
        source = sharepoint_files(config, credentials=credentials)

        # Extract resources from source
        resources = list(source.resources.values())
        assert len(resources) == 1

        # Get data from resource - with chunksize, this yields multiple dataframes (chunks)
        all_chunks = list(resources[0])
        # Should have 10 chunks (100 rows / 10 rows per chunk)
        assert len(all_chunks) == 10


@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS)
def test_sharepoint_list_pipeline(destination_name: str) -> None:
    """Integration test for sharepoint_list pipeline"""

    with patch("sources.sharepoint.SharepointClient") as mock_client_class:
        # Setup mock client
        mock_client_instance = Mock()
        mock_client_instance.site_info = {"id": "test_site", "displayName": "Test Site"}
        mock_client_instance.get_items_from_list.return_value = [
            {"Title": "Item 1", "Description": "Description 1", "Status": "Active"},
            {"Title": "Item 2", "Description": "Description 2", "Status": "Completed"},
            {"Title": "Item 3", "Description": "Description 3", "Status": "Active"},
        ]
        mock_client_class.return_value = mock_client_instance

        # Create pipeline
        pipeline = dlt.pipeline(
            pipeline_name="test_sharepoint_list",
            destination=destination_name,
            dataset_name="sharepoint_list_test",
            dev_mode=True,
        )

        # Create config
        config = SharepointListConfig(
            table_name="test_items",
            list_title="Test List",
        )

        # Create source and run pipeline
        credentials = SharepointCredentials(**MOCK_CREDENTIALS)
        source = sharepoint_list(config, credentials=credentials)
        load_info = pipeline.run(source)

        # Assert load info
        assert_load_info(load_info)

        # Check table counts
        table_counts = load_table_counts(pipeline, "test_items")
        assert table_counts["test_items"] == 3


@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS)
def test_sharepoint_files_pipeline(destination_name: str) -> None:
    """Integration test for sharepoint_files pipeline"""

    with patch("sources.sharepoint.SharepointClient") as mock_client_class:
        # Setup mock client
        mock_client_instance = Mock()
        mock_client_instance.site_info = {"id": "test_site", "displayName": "Test Site"}
        mock_client_instance.get_files_from_path.return_value = [
            {
                "name": "data.csv",
                "lastModifiedDateTime": "2024-01-01T00:00:00Z",
                "@microsoft.graph.downloadUrl": "https://test.com/data.csv",
            }
        ]

        # Create test CSV data
        csv_data = b"name,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,Chicago"
        mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data)
        mock_client_class.return_value = mock_client_instance

        # Create pipeline
        pipeline = dlt.pipeline(
            pipeline_name="test_sharepoint_files",
            destination=destination_name,
            dataset_name="sharepoint_files_test",
            dev_mode=True,
        )

        # Create config
        config = SharepointFilesConfig(
            file_type=FileType.CSV,
            folder_path="Documents",
            table_name="test_data",
            file_name_startswith="data",
        )

        # Create source and run pipeline
        credentials = SharepointCredentials(**MOCK_CREDENTIALS)
        source = sharepoint_files(config, credentials=credentials)
        load_info = pipeline.run(source)

        # Assert load info
        assert_load_info(load_info)

        # Check table counts
        table_counts = load_table_counts(pipeline, "test_data")
        assert table_counts["test_data"] == 3