From 89307f0bc98c2e9cfadc0d0b4be697b3093c9906 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:33:57 +0200 Subject: [PATCH 1/9] feat: Implement SharePoint data extraction sources and configuration classes --- sources/sharepoint/__init__.py | 106 ++++++++ sources/sharepoint/helpers.py | 245 ++++++++++++++++++ sources/sharepoint/sharepoint_files_config.py | 76 ++++++ sources/sharepoint_pipeline.py | 62 +++++ 4 files changed, 489 insertions(+) create mode 100644 sources/sharepoint/__init__.py create mode 100644 sources/sharepoint/helpers.py create mode 100644 sources/sharepoint/sharepoint_files_config.py create mode 100644 sources/sharepoint_pipeline.py diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py new file mode 100644 index 000000000..4d53d983f --- /dev/null +++ b/sources/sharepoint/__init__.py @@ -0,0 +1,106 @@ +from typing import Iterator, Dict +import re + +import dlt +from dlt.common.typing import TDataItems +from dlt.common.configuration.specs import configspec, BaseConfiguration +from loguru import logger +import pandas as pd + +from .helpers import SharepointClient +from .sharepoint_files_config import SharepointFilesConfig, SharepointListConfig + + +@configspec +class SharepointCredentials(BaseConfiguration): + client_id: str = None + tenant_id: str = None + site_id: str = None + client_secret: str = None + sub_site_id: str = "" + + +@dlt.source(name="sharepoint_list", max_table_nesting=0) +def sharepoint_list( + sharepoint_list_config: SharepointListConfig, + credentials: SharepointCredentials = dlt.secrets.value, +) -> Iterator[Dict[str, str]]: + client: SharepointClient = SharepointClient(**credentials) + client.connect() + logger.info(f"Connected to SharePoint site: {client.site_info}") + + def get_pipe(sharepoint_list_config: SharepointListConfig): + def get_records(sharepoint_list_config: SharepointListConfig): + data = client.get_items_from_list(list_title=sharepoint_list_config.list_title, select=sharepoint_list_config.select) + yield from data + return dlt.resource(get_records, name=sharepoint_list_config.table_name)(sharepoint_list_config) + yield get_pipe(sharepoint_list_config=sharepoint_list_config) + + +@dlt.source(name="sharepoint_files", max_table_nesting=0) +def sharepoint_files( + sharepoint_files_config: SharepointFilesConfig, + credentials: SharepointCredentials = dlt.secrets.value, +): + client: SharepointClient = SharepointClient(**credentials) + client.connect() + logger.info(f"Connected to SharePoint site: {client.site_info}") + + def get_files( + sharepoint_files_config: SharepointFilesConfig, + last_update_timestamp: dlt.sources.incremental = dlt.sources.incremental( + cursor_path="lastModifiedDateTime", + initial_value="2020-01-01T00:00:00Z", + primary_key=(), + ), + ): + current_last_value = last_update_timestamp.last_value + logger.debug(f"current_last_value: {current_last_value}") + for file_item in client.get_files_from_path( + folder_path=sharepoint_files_config.folder_path, + file_name_startswith=sharepoint_files_config.file_name_startswith, + pattern=sharepoint_files_config.pattern, + ): + if file_item["size"] > sharepoint_files_config.file_size_limit: + logger.warning(f"File {file_item['name']} is too large, skipping") + raise RuntimeError( + f"File {file_item['name']} is larger than the limit of" + f" {sharepoint_files_config.file_size_limit} bytes." 
+            )
+            logger.debug(
+                "filtering files based on lastModifiedDateTime, compare to last_value:"
+                f" {current_last_value}"
+            )
+            if file_item["lastModifiedDateTime"] > current_last_value or not sharepoint_files_config.is_file_incremental:
+                logger.info(
+                    f"Processing file after lastModifiedDateTime filter: {file_item['name']}"
+                )
+
+                file_item["pd_function"] = sharepoint_files_config.file_type.get_pd_function()
+                file_item["pd_kwargs"] = sharepoint_files_config.pandas_kwargs
+                yield file_item
+            else:
+                logger.info(
+                    f"Skipping file {file_item['name']} based on lastModifiedDateTime filter"
+                )
+
+    def get_records(file_item: Dict) -> TDataItems:
+        chunksize = file_item["pd_kwargs"].get("chunksize", None)
+        file_io = client.get_file_bytes_io(file_item=file_item)
+
+        if chunksize:
+            with file_item["pd_function"](file_io, **file_item["pd_kwargs"]) as reader:
+                for num, chunk in enumerate(reader):
+                    logger.info(f"Processing chunk {num} of {file_item['name']}")
+                    yield chunk
+        else:
+            df = file_item["pd_function"](file_io, **file_item["pd_kwargs"])
+            yield df
+        logger.debug(f"get_records done for {file_item['name']}")
+
+    def get_pipe(sharepoint_files_config: SharepointFilesConfig):
+        return dlt.resource(get_files, name=f"{sharepoint_files_config.table_name}_files")(sharepoint_files_config) | dlt.transformer(
+            get_records, name=sharepoint_files_config.table_name, parallelized=False
+        )
+
+    yield get_pipe(sharepoint_files_config=sharepoint_files_config)
diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py
new file mode 100644
index 000000000..7e534ba69
--- /dev/null
+++ b/sources/sharepoint/helpers.py
@@ -0,0 +1,245 @@
+from typing import Dict, Union, List, Tuple
+from io import BytesIO
+import re
+
+from msal import ConfidentialClientApplication
+from loguru import logger
+from dlt.sources.helpers.rest_client import RESTClient
+from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
+from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator
+
+
+class SharepointClient:
+    # * playground: https://developer.microsoft.com/en-us/graph/graph-explorer
+    # * If the result contains more results, Microsoft Graph returns an @odata.nextLink property
+
+    def __init__(
+        self,
+        client_id: str,
+        tenant_id: str,
+        site_id: str,
+        client_secret: str,
+        sub_site_id: str = "",
+    ) -> None:
+        self.client_id = client_id
+        self.tenant_id = tenant_id
+        self.client_secret = client_secret
+        self.sub_site_id = sub_site_id
+        self.site_id = site_id
+        if not all([self.client_id, self.tenant_id, self.client_secret, self.site_id]):
+            raise ValueError(
+                "client_id, tenant_id, client_secret and site_id are required to connect to"
+                " SharePoint"
+            )
+        self.graph_api_url = "https://graph.microsoft.com/v1.0/sites"
+        self.graph_site_url = f"{self.graph_api_url}/{self.site_id}"
+        if self.sub_site_id:
+            self.graph_site_url += f"/sites/{self.sub_site_id}"
+
+    def connect(self) -> None:
+        authority = f"https://login.microsoftonline.com/{self.tenant_id}"
+        scope = ["https://graph.microsoft.com/.default"]
+
+        app = ConfidentialClientApplication(
+            self.client_id,
+            authority=authority,
+            client_credential=self.client_secret,
+        )
+
+        # Get the access token
+        token_response = app.acquire_token_for_client(scopes=scope)
+        access_token = token_response.get("access_token", None)
+
+        if access_token:
+            self.client = RESTClient(
+                base_url=self.graph_site_url,
+                auth=BearerTokenAuth(access_token),
+                paginator=JSONLinkPaginator(next_url_path="@odata.nextLink"),
+            )
+            logger.success(f"Connected
to SharePoint site id: {self.site_id} successfully") + else: + raise ConnectionError("Connection failed : ", token_response) + + @property + def sub_sites(self) -> List: + url = f"{self.graph_site_url}/sites" + response = self.client.get(url) + site_info = response.json() + if "value" in site_info: + return site_info["value"] + else: + logger.warning(f"No subsite found in {url}") + + @property + def site_info(self) -> Dict: + url = f"{self.graph_site_url}" + response = self.client.get(url) + site_info = response.json() + if not "error" in site_info: + return site_info + else: + logger.warning(f"No site_info found in {url}") + + def get_all_lists_in_site(self) -> List[Dict]: + url = f"{self.graph_site_url}/lists" + res = self.client.get(url) + res.raise_for_status() + lists_info = res.json() + if "value" in lists_info: + all_items = lists_info["value"] + filtered_lists = [ + item for item in all_items + if item.get("list", {}).get("template") == "genericList" + and "Lists" in item.get("webUrl", "") + ] + return filtered_lists + else: + filtered_lists = [] + if not filtered_lists: + logger.warning(f"No lists found in {url}") + return filtered_lists + + def get_items_from_list(self, list_title: str, select:str = None) -> List[Dict]: + #TODO, pagination not yet implemented + logger.warning( + "Pagination is not implemented for get_items_from_list, " + "it will return only first page of items." + ) + all_lists = self.get_all_lists_in_site() + filtered_lists = [ + x for x in all_lists + if x.get("list", {}).get("template") == "genericList" + and "Lists" in x.get("webUrl", "") + ] + + possible_list_titles = [x["displayName"] for x in filtered_lists] + if list_title not in possible_list_titles: + raise ValueError( + f"List with title '{list_title}' not found in site {self.site_id}. 
" + f"Available lists: {possible_list_titles}" + ) + + # Get the list ID + list_id = next( + x["id"] for x in filtered_lists if x["displayName"] == list_title + ) + + url = f"{self.graph_site_url}/lists/{list_id}/items?expand=fields" + if select: + url += f"(select={select})" + res = self.client.get(url) + res.raise_for_status() + items_info = res.json() + + if "value" in items_info: + output = [x.get("fields", {}) for x in items_info["value"]] + else: + output = [] + if output: + logger.info(f"Got {len(output)} items from list: {list_title}") + return output + else: + logger.warning(f"No items found in list: {list_title}, with select: {select}") + + def get_files_from_path( + self, folder_path: str, file_name_startswith: str, pattern: str = None + ) -> Dict: + folder_url = ( + f"{self.graph_site_url}/drive/root:/{folder_path}:/children?$filter=startswith(name," + f" '{file_name_startswith}')" + ) + logger.debug(f"Getting files from folder with endpoint: {folder_url}") + res = self.client.get(folder_url) + file_and_folder_items = res.json().get("value", []) + file_items = [x for x in file_and_folder_items if "file" in x.keys()] + if pattern: + logger.debug(f"Filtering files with pattern: {pattern}") + file_items = [x for x in file_items if re.search(pattern, x["name"])] + + logger.debug(f"Got number files from ms graph api: {len(file_items)}") + return file_items + + def get_file_bytes_io(self, file_item: Dict): + file_url = file_item["@microsoft.graph.downloadUrl"] + response = self.client.get(file_url) + if response.status_code == 200: + bytes_io = BytesIO(response.content) + logger.info( + f"File {file_item['name']} downloaded to BytesIO, size: {len(bytes_io.getvalue())}" + ) + return bytes_io + else: + raise FileNotFoundError(f"File not found: {file_item['name']} or can't be downloaded") + + def archive_file(self, file_item: Dict, archive_folder_path: str, new_file_name: str) -> None: + url = f"{self.graph_site_url}/drive/items/{file_item['id']}" + archive_folder_path = self.remove_driver_root_in_path(archive_folder_path) + archive_folder_id = self.create_folder_if_not_exists(folder_path=archive_folder_path) + body = { + "parentReference": {"id": archive_folder_id}, + "name": new_file_name, + } + res = self.client.patch(url, json=body) + if res.status_code == 200: + logger.success( + f"File {file_item['name']} renamed to {new_file_name} in {archive_folder_path}" + ) + else: + raise RuntimeError(f"File {file_item['name']} can't be renamed to {new_file_name}") + + def safe_get_folder_id(self, folder_path: str) -> Union[str, None]: + folder_url = f"{self.graph_site_url}/drive/root:/{folder_path}" + res = self.client.get(folder_url) + if res.status_code == 200: + return res.json()["id"] + + def list_folder(self, folder_path: str) -> Tuple[List, List]: + """List sub folders and files in folder_path + + Args: + folder_path (str): folder_path from sharepoint + + Returns: + Tuple[List, List]: (List of folders, List of files) + """ + if r"/" not in folder_path: + raise ValueError(f"Invalid folder path: {folder_path}, must contain '/'") + folder_url = f"{self.graph_site_url}/drive/root:/{folder_path}:/children" + logger.info(f"Listing from folder_path: {folder_path} using {folder_url}") + res = self.client.get(folder_url) + file_and_folder_items = res.json().get("value", []) + file_items = [x for x in file_and_folder_items if "file" in x.keys()] + folder_items = [x for x in file_and_folder_items if "folder" in x.keys()] + return (folder_items, file_items) + + def create_folder(self, folder_path: 
str) -> str: + if r"/" not in folder_path: + raise ValueError(f"Invalid folder path: {folder_path}, must contain '/'") + parent_folder, folder_name = folder_path.rsplit("/", 1) + parent_folder_id = self.safe_get_folder_id(parent_folder) + if not parent_folder_id: + raise ValueError(f"Parent folder {parent_folder} not found") + logger.debug(f"Creating folder {folder_name} in {parent_folder}") + folder_url = f"{self.graph_site_url}/drive/items/{parent_folder_id}/children" + body = { + "name": folder_name, + "folder": {}, + "@microsoft.graph.conflictBehavior": "fail", + } + res = self.client.post(folder_url, json=body) + if res.status_code == 201: + logger.success(f"Folder {folder_name} created") + return res.json()["id"] + else: + raise RuntimeError(f"Folder {folder_name} can't be created") + + def create_folder_if_not_exists(self, folder_path: str) -> str: + folder_id = self.safe_get_folder_id(folder_path) + if folder_id: + logger.info(f"Folder {folder_path} already exists") + return folder_id + else: + return self.create_folder(folder_path) + + def remove_driver_root_in_path(self, path: str) -> str: + return re.sub(r"^/drive/root:/", "", path) diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py new file mode 100644 index 000000000..fd9c5ebd6 --- /dev/null +++ b/sources/sharepoint/sharepoint_files_config.py @@ -0,0 +1,76 @@ +from typing import Iterator, Optional, Sequence, List, Dict +import re +from enum import Enum + +from loguru import logger +import pandas as pd +from pydantic import BaseModel + + + +class FileType(Enum): + EXCEL = "excel" + CSV = "csv" + JSON = "json" + PARQUET = "parquet" + SAS = "sas" + SPSS = "spss" + SAV = "sav" + + def get_pd_function(self): + return { + self.EXCEL: pd.read_excel, + self.CSV: pd.read_csv, + self.JSON: pd.read_json, + self.PARQUET: pd.read_parquet, + self.SAS: pd.read_sas, + self.SPSS: pd.read_spss, + }[self] + + +class SharepointListConfig(BaseModel): + table_name: str + list_title: str + select: Optional[str] = None + limit: Optional[int] = None + is_incremental: Optional[bool] = False + + def __init__(self, **data): + super().__init__(**data) + if self.is_incremental is True: + raise NotImplementedError( + "Incremental loading for Sharepoint List is not implemented yet." 
+            )
+
+class SharepointFilesConfig(BaseModel):
+    file_type: FileType
+    folder_path: str
+    table_name: str
+    file_name_startswith: str
+    pattern: Optional[str] = ".*"
+    pandas_kwargs: Dict = {}
+    limit: Optional[int] = None
+    file_size_limit: Optional[int] = 100_000_000  # 100 MB
+    is_compressed_folder: Optional[bool] = False
+    if_apply_str_to_all_columns: Optional[bool] = True
+    is_file_incremental: bool = False
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self.folder_path = validate_folder_path(self.folder_path)
+        self.pattern = f"^{self.file_name_startswith}{self.pattern}"
+
+
+def validate_folder_path(folder_path: str) -> str:
+    if folder_path.startswith("/"):
+        folder_path = folder_path[1:]
+    if folder_path.endswith("/"):
+        folder_path = folder_path[:-1]
+    if not re.compile(r"^[a-zA-Z0-9_\-/\s\.]*$").match(folder_path):
+        raise ValueError(
+            "Invalid folder path, only alphanumeric characters, dashes, underscores,"
+            f" spaces, dots and slashes are allowed: {folder_path}"
+        )
+    if re.compile(r"//").search(folder_path):
+        raise ValueError(f"Invalid folder path with double slashes: {folder_path}")
+    return folder_path
diff --git a/sources/sharepoint_pipeline.py b/sources/sharepoint_pipeline.py
new file mode 100644
index 000000000..57b83b1ef
--- /dev/null
+++ b/sources/sharepoint_pipeline.py
@@ -0,0 +1,62 @@
+
+import dlt
+from sharepoint import sharepoint_list, sharepoint_files, SharepointCredentials
+from sharepoint.sharepoint_files_config import SharepointFilesConfig, SharepointListConfig
+
+if __name__ == "__main__":
+    # --- 1. Define SharePoint credentials ---
+    credentials = SharepointCredentials(
+        client_id="your-client-id",
+        tenant_id="your-tenant-id",
+        site_id="your-site-id",
+        client_secret="your-client-secret",
+        sub_site_id=""
+    )
+
+    # --- 2. Configure SharePoint list extraction ---
+    list_config = SharepointListConfig(
+        list_title="test_list",
+        select="Title,ins",
+        table_name="sharepoint_list_table"
+    )
+
+    # --- 3. Configure SharePoint file extraction ---
+    files_config = SharepointFilesConfig(
+        folder_path="General/sharepoint_test",
+        file_name_startswith="test_",
+        pattern=r".*\.csv$",
+        file_type="csv",
+        table_name="sharepoint_reports",
+        is_file_incremental=True,
+        file_size_limit=5_000_000,
+        pandas_kwargs={}
+    )
+
+    # --- 4. Create the DLT pipeline (destination = DuckDB) ---
+    pipeline = dlt.pipeline(
+        pipeline_name="sharepoint_to_duckdb",
+        destination="duckdb",
+        dataset_name="sharepoint_data",
+        full_refresh=False
+    )
+
+    # --- 5.
Run both sources and load to DuckDB --- + print("Loading SharePoint List data...") + list_load_info = pipeline.run( + sharepoint_list(sharepoint_list_config=list_config, credentials=credentials) + ) + print(list_load_info) + with pipeline.sql_client() as client: + df = client.execute("SELECT * FROM sharepoint_list_table LIMIT 10").df() + print(df) + + + print("Loading SharePoint Files data...") + files_load_info = pipeline.run( + sharepoint_files(sharepoint_files_config=files_config, credentials=credentials) + ) + print(files_load_info) + + with pipeline.sql_client() as client: + df = client.execute("SELECT * FROM sharepoint_reports LIMIT 10").df() + print(df) From d9fe95cd9f167c302e01e071c1afa0db9ca7b2a1 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:36:56 +0200 Subject: [PATCH 2/9] Fix: Remove file size limit from SharePoint file configuration and extraction --- sources/sharepoint/__init__.py | 6 ------ sources/sharepoint/sharepoint_files_config.py | 1 - sources/sharepoint_pipeline.py | 1 - 3 files changed, 8 deletions(-) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py index 4d53d983f..1740e1c70 100644 --- a/sources/sharepoint/__init__.py +++ b/sources/sharepoint/__init__.py @@ -61,12 +61,6 @@ def get_files( file_name_startswith=sharepoint_files_config.file_name_startswith, pattern=sharepoint_files_config.pattern, ): - if file_item["size"] > sharepoint_files_config.file_size_limit: - logger.warning(f"File {file_item['name']} is too large, skipping") - raise RuntimeError( - f"File {file_item['name']} is larger than the limit of" - f" {sharepoint_files_config.file_size_limit} bytes." - ) logger.debug( "filtering files based on lastModifiedDateTime, compare to last_value:" f" {current_last_value}" diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index fd9c5ebd6..0e748c2c8 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -50,7 +50,6 @@ class SharepointFilesConfig(BaseModel): pattern: Optional[str] = ".*" pandas_kwargs: Dict = {} limit: Optional[int] = None - file_size_limit: Optional[int] = 100_000_000 # 100 MB is_compressed_folder: Optional[bool] = False if_apply_str_to_all_columns: Optional[bool] = True is_file_incremental: bool = False diff --git a/sources/sharepoint_pipeline.py b/sources/sharepoint_pipeline.py index 57b83b1ef..0e63653ff 100644 --- a/sources/sharepoint_pipeline.py +++ b/sources/sharepoint_pipeline.py @@ -28,7 +28,6 @@ file_type="csv", table_name="sharepoint_reports", is_file_incremental=True, - file_size_limit=5_000_000, pandas_kwargs={} ) From dc6e8f5ce24d92d326b917dcba1f119d03663139 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:39:31 +0200 Subject: [PATCH 3/9] Refactor: Replace loguru logger with dlt.common logger in SharePoint modules --- sources/sharepoint/__init__.py | 2 +- sources/sharepoint/helpers.py | 2 +- sources/sharepoint/sharepoint_files_config.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py index 1740e1c70..3d5670b50 100644 --- a/sources/sharepoint/__init__.py +++ b/sources/sharepoint/__init__.py @@ -4,7 +4,7 @@ import dlt from dlt.common.typing import TDataItems from dlt.common.configuration.specs import configspec, BaseConfiguration -from loguru import logger +from dlt.common import logger import pandas as pd from .helpers import SharepointClient diff --git 
a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index 7e534ba69..3ccf19fbe 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -3,7 +3,7 @@ import re from msal import ConfidentialClientApplication -from loguru import logger +from dlt.common import logger from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.rest_client.auth import BearerTokenAuth from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index 0e748c2c8..1daf69254 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -1,13 +1,11 @@ -from typing import Iterator, Optional, Sequence, List, Dict +from typing import Optional, Dict import re from enum import Enum -from loguru import logger import pandas as pd from pydantic import BaseModel - class FileType(Enum): EXCEL = "excel" CSV = "csv" From ad83f5709288c935c194be422e43b16bfd544b03 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:44:28 +0200 Subject: [PATCH 4/9] Refactor: Remove unused methods and imports from SharepointClient class --- sources/sharepoint/helpers.py | 75 +---------------------------------- 1 file changed, 1 insertion(+), 74 deletions(-) diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index 3ccf19fbe..e88e42f25 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -1,4 +1,4 @@ -from typing import Dict, Union, List, Tuple +from typing import Dict, List from io import BytesIO import re @@ -170,76 +170,3 @@ def get_file_bytes_io(self, file_item: Dict): return bytes_io else: raise FileNotFoundError(f"File not found: {file_item['name']} or can't be downloaded") - - def archive_file(self, file_item: Dict, archive_folder_path: str, new_file_name: str) -> None: - url = f"{self.graph_site_url}/drive/items/{file_item['id']}" - archive_folder_path = self.remove_driver_root_in_path(archive_folder_path) - archive_folder_id = self.create_folder_if_not_exists(folder_path=archive_folder_path) - body = { - "parentReference": {"id": archive_folder_id}, - "name": new_file_name, - } - res = self.client.patch(url, json=body) - if res.status_code == 200: - logger.success( - f"File {file_item['name']} renamed to {new_file_name} in {archive_folder_path}" - ) - else: - raise RuntimeError(f"File {file_item['name']} can't be renamed to {new_file_name}") - - def safe_get_folder_id(self, folder_path: str) -> Union[str, None]: - folder_url = f"{self.graph_site_url}/drive/root:/{folder_path}" - res = self.client.get(folder_url) - if res.status_code == 200: - return res.json()["id"] - - def list_folder(self, folder_path: str) -> Tuple[List, List]: - """List sub folders and files in folder_path - - Args: - folder_path (str): folder_path from sharepoint - - Returns: - Tuple[List, List]: (List of folders, List of files) - """ - if r"/" not in folder_path: - raise ValueError(f"Invalid folder path: {folder_path}, must contain '/'") - folder_url = f"{self.graph_site_url}/drive/root:/{folder_path}:/children" - logger.info(f"Listing from folder_path: {folder_path} using {folder_url}") - res = self.client.get(folder_url) - file_and_folder_items = res.json().get("value", []) - file_items = [x for x in file_and_folder_items if "file" in x.keys()] - folder_items = [x for x in file_and_folder_items if "folder" in x.keys()] - return (folder_items, file_items) - - def create_folder(self, 
folder_path: str) -> str: - if r"/" not in folder_path: - raise ValueError(f"Invalid folder path: {folder_path}, must contain '/'") - parent_folder, folder_name = folder_path.rsplit("/", 1) - parent_folder_id = self.safe_get_folder_id(parent_folder) - if not parent_folder_id: - raise ValueError(f"Parent folder {parent_folder} not found") - logger.debug(f"Creating folder {folder_name} in {parent_folder}") - folder_url = f"{self.graph_site_url}/drive/items/{parent_folder_id}/children" - body = { - "name": folder_name, - "folder": {}, - "@microsoft.graph.conflictBehavior": "fail", - } - res = self.client.post(folder_url, json=body) - if res.status_code == 201: - logger.success(f"Folder {folder_name} created") - return res.json()["id"] - else: - raise RuntimeError(f"Folder {folder_name} can't be created") - - def create_folder_if_not_exists(self, folder_path: str) -> str: - folder_id = self.safe_get_folder_id(folder_path) - if folder_id: - logger.info(f"Folder {folder_path} already exists") - return folder_id - else: - return self.create_folder(folder_path) - - def remove_driver_root_in_path(self, path: str) -> str: - return re.sub(r"^/drive/root:/", "", path) From c94cb2f888af479f27812d683f6685a0d643fe5d Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:52:33 +0200 Subject: [PATCH 5/9] fix: Change log level from success to info for SharePoint connection message --- sources/sharepoint/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index e88e42f25..852e71a19 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -56,7 +56,7 @@ def connect(self) -> None: auth=BearerTokenAuth(access_token), paginator=JSONLinkPaginator(next_url_path="@odata.nextLink"), ) - logger.success(f"Connected to SharePoint site id: {self.site_id} successfully") + logger.info(f"Connected to SharePoint site id: {self.site_id} successfully") else: raise ConnectionError("Connection failed : ", token_response) From 07249844824307bab82703e8bdef072c49009ddd Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:54:52 +0200 Subject: [PATCH 6/9] refactor: Improve code readability by formatting and organizing function definitions in SharePoint modules --- sources/sharepoint/__init__.py | 24 +++++++++++++++---- sources/sharepoint/helpers.py | 18 +++++++++----- sources/sharepoint/sharepoint_files_config.py | 1 + 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py index 3d5670b50..bb684add8 100644 --- a/sources/sharepoint/__init__.py +++ b/sources/sharepoint/__init__.py @@ -31,9 +31,16 @@ def sharepoint_list( def get_pipe(sharepoint_list_config: SharepointListConfig): def get_records(sharepoint_list_config: SharepointListConfig): - data = client.get_items_from_list(list_title=sharepoint_list_config.list_title, select=sharepoint_list_config.select) + data = client.get_items_from_list( + list_title=sharepoint_list_config.list_title, + select=sharepoint_list_config.select, + ) yield from data - return dlt.resource(get_records, name=sharepoint_list_config.table_name)(sharepoint_list_config) + + return dlt.resource(get_records, name=sharepoint_list_config.table_name)( + sharepoint_list_config + ) + yield get_pipe(sharepoint_list_config=sharepoint_list_config) @@ -65,12 +72,17 @@ def get_files( "filtering files based on lastModifiedDateTime, compare to last_value:" f" {current_last_value}" ) - if 
file_item["lastModifiedDateTime"] > current_last_value or not sharepoint_files_config.is_file_incremental: + if ( + file_item["lastModifiedDateTime"] > current_last_value + or not sharepoint_files_config.is_file_incremental + ): logger.info( f"Processing file after lastModifiedDateTime filter: {file_item['name']}" ) - file_item["pd_function"] = sharepoint_files_config.file_type.get_pd_function() + file_item["pd_function"] = ( + sharepoint_files_config.file_type.get_pd_function() + ) file_item["pd_kwargs"] = sharepoint_files_config.pandas_kwargs yield file_item else: @@ -93,7 +105,9 @@ def get_records(file_item: Dict) -> TDataItems: logger.debug(f"get_records done for {file_item['name']}") def get_pipe(sharepoint_files_config: SharepointFilesConfig): - return dlt.resource(get_files, name=f"{sharepoint_files_config.table_name}_files")(sharepoint_files_config) | dlt.transformer( + return dlt.resource( + get_files, name=f"{sharepoint_files_config.table_name}_files" + )(sharepoint_files_config) | dlt.transformer( get_records, name=sharepoint_files_config.table_name, parallelized=False ) diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index 852e71a19..e634bc374 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -88,7 +88,8 @@ def get_all_lists_in_site(self) -> List[Dict]: if "value" in lists_info: all_items = lists_info["value"] filtered_lists = [ - item for item in all_items + item + for item in all_items if item.get("list", {}).get("template") == "genericList" and "Lists" in item.get("webUrl", "") ] @@ -99,15 +100,16 @@ def get_all_lists_in_site(self) -> List[Dict]: logger.warning(f"No lists found in {url}") return filtered_lists - def get_items_from_list(self, list_title: str, select:str = None) -> List[Dict]: - #TODO, pagination not yet implemented + def get_items_from_list(self, list_title: str, select: str = None) -> List[Dict]: + # TODO, pagination not yet implemented logger.warning( "Pagination is not implemented for get_items_from_list, " "it will return only first page of items." ) all_lists = self.get_all_lists_in_site() filtered_lists = [ - x for x in all_lists + x + for x in all_lists if x.get("list", {}).get("template") == "genericList" and "Lists" in x.get("webUrl", "") ] @@ -139,7 +141,9 @@ def get_items_from_list(self, list_title: str, select:str = None) -> List[Dict]: logger.info(f"Got {len(output)} items from list: {list_title}") return output else: - logger.warning(f"No items found in list: {list_title}, with select: {select}") + logger.warning( + f"No items found in list: {list_title}, with select: {select}" + ) def get_files_from_path( self, folder_path: str, file_name_startswith: str, pattern: str = None @@ -169,4 +173,6 @@ def get_file_bytes_io(self, file_item: Dict): ) return bytes_io else: - raise FileNotFoundError(f"File not found: {file_item['name']} or can't be downloaded") + raise FileNotFoundError( + f"File not found: {file_item['name']} or can't be downloaded" + ) diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index 1daf69254..1a8e4d8a1 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -40,6 +40,7 @@ def __init__(self, **data): "Incremental loading for Sharepoint List is not implemented yet." 
) + class SharepointFilesConfig(BaseModel): file_type: FileType folder_path: str From 34776d1b62ab49e0d710e31432033e524bf65a78 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 16:44:41 +0200 Subject: [PATCH 7/9] fix: Remove unused attributes from SharepointListConfig and SharepointFilesConfig classes --- sources/sharepoint/sharepoint_files_config.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index 1a8e4d8a1..e1bc5a821 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -30,7 +30,6 @@ class SharepointListConfig(BaseModel): table_name: str list_title: str select: Optional[str] = None - limit: Optional[int] = None is_incremental: Optional[bool] = False def __init__(self, **data): @@ -48,9 +47,6 @@ class SharepointFilesConfig(BaseModel): file_name_startswith: str pattern: Optional[str] = ".*" pandas_kwargs: Dict = {} - limit: Optional[int] = None - is_compressed_folder: Optional[bool] = False - if_apply_str_to_all_columns: Optional[bool] = True is_file_incremental: bool = False def __init__(self, **data): From 933becdb09bdd73b07fe339d50e64b15cbc2ce39 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Mon, 8 Dec 2025 15:19:55 +0100 Subject: [PATCH 8/9] feat: Add SharePoint source with list and file extraction capabilities - Updated `pyproject.toml` to include SharePoint dependencies. - Created `README.md` for SharePoint source documentation. - Implemented SharePoint client in `helpers.py` for API interaction. - Added configuration classes in `sharepoint_files_config.py` for lists and files. - Developed extraction functions in `__init__.py` for SharePoint lists and files. - Created unit tests for SharePoint source in `test_sharepoint_source.py`. - Added requirements file for SharePoint source dependencies. --- pyproject.toml | 5 + sources/sharepoint/README.md | 256 +++++++ sources/sharepoint/__init__.py | 58 ++ sources/sharepoint/helpers.py | 91 ++- sources/sharepoint/requirements.txt | 3 + sources/sharepoint/sharepoint_files_config.py | 63 ++ tests/sharepoint/__init__.py | 0 tests/sharepoint/test_sharepoint_source.py | 651 ++++++++++++++++++ 8 files changed, 1125 insertions(+), 2 deletions(-) create mode 100644 sources/sharepoint/README.md create mode 100644 sources/sharepoint/requirements.txt create mode 100644 tests/sharepoint/__init__.py create mode 100644 tests/sharepoint/test_sharepoint_source.py diff --git a/pyproject.toml b/pyproject.toml index 5be781448..e7025cd35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,10 @@ scrapy = [ "scrapy>=2.11.0,<3", "twisted==22.10.0", ] +sharepoint = [ + "msal>=1.20.0", + "pandas>=2.0.0", +] [tool.uv] default-groups = [ @@ -113,6 +117,7 @@ default-groups = [ "airtable", "filesystem", "scrapy", + "sharepoint", ] # [tool.uv.sources] diff --git a/sources/sharepoint/README.md b/sources/sharepoint/README.md new file mode 100644 index 000000000..dc2e4afb0 --- /dev/null +++ b/sources/sharepoint/README.md @@ -0,0 +1,256 @@ +# SharePoint Source + +This source allows you to extract data from SharePoint lists and files using the Microsoft Graph API. 
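+
+The snippet below is a minimal end-to-end sketch condensed from the usage examples later in this document. The credential values are placeholders from your Azure AD app registration, and the folder, table, and pipeline names are purely illustrative:
+
+```python
+import dlt
+
+from sharepoint import sharepoint_files, SharepointCredentials
+from sharepoint.sharepoint_files_config import SharepointFilesConfig, FileType
+
+# Placeholder credentials -- replace with your Azure AD app registration values
+credentials = SharepointCredentials(
+    client_id="your-client-id",
+    tenant_id="your-tenant-id",
+    site_id="your-site-id",
+    client_secret="your-client-secret",
+)
+
+# Load every CSV file in the folder whose name starts with "report_"
+files_config = SharepointFilesConfig(
+    file_type=FileType.CSV,
+    folder_path="Shared Documents/Reports",
+    table_name="reports",
+    file_name_startswith="report_",
+)
+
+pipeline = dlt.pipeline(
+    pipeline_name="sharepoint_quickstart",
+    destination="duckdb",
+    dataset_name="sharepoint_data",
+)
+load_info = pipeline.run(
+    sharepoint_files(sharepoint_files_config=files_config, credentials=credentials)
+)
+print(load_info)
+```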
+ +## Features + +- Extract data from SharePoint lists +- Download and process files from SharePoint document libraries +- Support for multiple file formats (CSV, Excel, JSON, Parquet, SAS, SPSS) +- Incremental loading support for files based on modification time +- Flexible file filtering with regex patterns + +## Prerequisites + +Before using this source, you need: + +1. **Azure AD Application Registration** with the following: + - Client ID + - Tenant ID + - Client Secret + - Microsoft Graph API permissions: + - `Sites.Read.All` or `Sites.ReadWrite.All` + - `Files.Read.All` (for file operations) + +2. **SharePoint Site ID**: The unique identifier for your SharePoint site + +## Configuration + +### Credentials + +Configure your credentials in `secrets.toml`: + +```toml +[sources.sharepoint] +client_id = "your-client-id" +tenant_id = "your-tenant-id" +site_id = "your-site-id" +client_secret = "your-client-secret" +sub_site_id = "" # Optional: for sub-sites +``` + +### SharePoint List Configuration + +```python +from sharepoint.sharepoint_files_config import SharepointListConfig + +list_config = SharepointListConfig( + table_name="my_list_data", + list_title="My SharePoint List", + select="Title,Description,Status", # Optional: specific fields + is_incremental=False # Incremental not yet implemented +) +``` + +### SharePoint Files Configuration + +```python +from sharepoint.sharepoint_files_config import SharepointFilesConfig, FileType + +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents/Reports", + table_name="reports_data", + file_name_startswith="report_", + pattern=r".*\.csv$", # Optional: regex pattern for filtering + pandas_kwargs={"sep": ","}, # Optional: pandas read options + is_file_incremental=True # Enable incremental loading +) +``` + +## Usage Examples + +### Example 1: Load SharePoint List Data + +```python +import dlt +from sharepoint import sharepoint_list, SharepointCredentials +from sharepoint.sharepoint_files_config import SharepointListConfig + +# Configure credentials +credentials = SharepointCredentials() + +# Configure list extraction +list_config = SharepointListConfig( + table_name="tasks", + list_title="Project Tasks" +) + +# Create and run pipeline +pipeline = dlt.pipeline( + pipeline_name="sharepoint_list", + destination="duckdb", + dataset_name="sharepoint_data" +) + +load_info = pipeline.run( + sharepoint_list( + sharepoint_list_config=list_config, + credentials=credentials + ) +) +print(load_info) +``` + +### Example 2: Load Files from SharePoint + +```python +import dlt +from sharepoint import sharepoint_files, SharepointCredentials +from sharepoint.sharepoint_files_config import SharepointFilesConfig, FileType + +# Configure credentials +credentials = SharepointCredentials() + +# Configure file extraction +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Shared Documents/Reports", + table_name="monthly_reports", + file_name_startswith="report_", + pattern=r"202[4-5].*\.csv$", + is_file_incremental=True, + pandas_kwargs={"sep": ",", "encoding": "utf-8"} +) + +# Create and run pipeline +pipeline = dlt.pipeline( + pipeline_name="sharepoint_files", + destination="duckdb", + dataset_name="sharepoint_data" +) + +load_info = pipeline.run( + sharepoint_files( + sharepoint_files_config=files_config, + credentials=credentials + ) +) +print(load_info) +``` + +### Example 3: Process Excel Files with Chunking + +```python +files_config = SharepointFilesConfig( + file_type=FileType.EXCEL, + 
folder_path="Reports/Annual", + table_name="large_report", + file_name_startswith="annual_", + pandas_kwargs={ + "sheet_name": "Data", + "chunksize": 1000 # Process in chunks of 1000 rows + } +) +``` + +## Supported File Types + +The source supports the following file types via pandas: + +- `FileType.CSV` - CSV files +- `FileType.EXCEL` - Excel files (.xlsx, .xls) +- `FileType.JSON` - JSON files +- `FileType.PARQUET` - Parquet files +- `FileType.SAS` - SAS files +- `FileType.SPSS` - SPSS files + +## Incremental Loading + +### File Incremental Loading + +When `is_file_incremental=True`, the source tracks the `lastModifiedDateTime` of files and only processes files that have been modified since the last run. + +```python +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="data", + file_name_startswith="data_", + is_file_incremental=True # Only process new/modified files +) +``` + +### List Incremental Loading + +Incremental loading for SharePoint lists is not yet implemented. + +## Advanced Configuration + +### Folder Path Validation + +Folder paths are automatically normalized: +- Leading/trailing slashes are removed +- Double slashes are not allowed +- Only alphanumeric characters, dashes, underscores, spaces, and dots are allowed + +### Pattern Matching + +The `pattern` parameter is automatically prefixed with `file_name_startswith`. For example: + +```python +files_config = SharepointFilesConfig( + file_name_startswith="report_", + pattern=r"\d{8}\.csv$" +) +# Effective pattern: ^report_\d{8}\.csv$ +``` + +### Pandas Kwargs + +Any pandas read function parameters can be passed via `pandas_kwargs`: + +```python +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="data", + file_name_startswith="data", + pandas_kwargs={ + "sep": ";", + "encoding": "latin1", + "decimal": ",", + "chunksize": 5000 + } +) +``` + +## Troubleshooting + +### Authentication Issues + +If you encounter authentication errors: +1. Verify your Client ID, Tenant ID, and Client Secret are correct +2. Ensure your Azure AD app has the required permissions +3. Check that admin consent has been granted for the permissions + +### File Not Found + +If files are not being found: +1. Verify the folder path is correct (case-sensitive) +2. Check that the file name pattern matches your files +3. Ensure your app has access to the SharePoint site and folder + +### Permission Errors + +Ensure your Azure AD application has been granted: +- `Sites.Read.All` or `Sites.ReadWrite.All` +- `Files.Read.All` + +And that admin consent has been provided for these permissions. + +## Resources + +- [Microsoft Graph API Documentation](https://learn.microsoft.com/en-us/graph/api/overview) +- [SharePoint REST API](https://learn.microsoft.com/en-us/sharepoint/dev/sp-add-ins/get-to-know-the-sharepoint-rest-service) +- [Azure AD App Registration](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py index bb684add8..b61d84fe2 100644 --- a/sources/sharepoint/__init__.py +++ b/sources/sharepoint/__init__.py @@ -1,3 +1,8 @@ +"""SharePoint data source for dlt. + +Provides sources for extracting data from SharePoint lists and files +using the Microsoft Graph API. 
+""" from typing import Iterator, Dict import re @@ -13,6 +18,15 @@ @configspec class SharepointCredentials(BaseConfiguration): + """Credentials for SharePoint authentication via Azure AD. + + Attributes: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + """ client_id: str = None tenant_id: str = None site_id: str = None @@ -25,6 +39,26 @@ def sharepoint_list( sharepoint_list_config: SharepointListConfig, credentials: SharepointCredentials = dlt.secrets.value, ) -> Iterator[Dict[str, str]]: + """Extract data from a SharePoint list. + + This source connects to SharePoint using Microsoft Graph API and retrieves + items from a specified list. + + Args: + sharepoint_list_config: Configuration for the SharePoint list extraction + credentials: SharePoint authentication credentials + + Yields: + DLT resource containing SharePoint list items + + Example: + >>> config = SharepointListConfig( + ... table_name="tasks", + ... list_title="Project Tasks" + ... ) + >>> source = sharepoint_list(config) + >>> pipeline.run(source) + """ client: SharepointClient = SharepointClient(**credentials) client.connect() logger.info(f"Connected to SharePoint site: {client.site_info}") @@ -49,6 +83,30 @@ def sharepoint_files( sharepoint_files_config: SharepointFilesConfig, credentials: SharepointCredentials = dlt.secrets.value, ): + """Extract and process files from SharePoint document libraries. + + This source downloads files from SharePoint based on the configuration, + processes them using pandas, and yields the data for loading. + + Supports incremental loading based on file modification time. + + Args: + sharepoint_files_config: Configuration for file extraction and processing + credentials: SharePoint authentication credentials + + Yields: + DLT resource containing processed file data + + Example: + >>> config = SharepointFilesConfig( + ... file_type=FileType.CSV, + ... folder_path="Documents/Reports", + ... table_name="reports", + ... file_name_startswith="report_" + ... ) + >>> source = sharepoint_files(config) + >>> pipeline.run(source) + """ client: SharepointClient = SharepointClient(**credentials) client.connect() logger.info(f"Connected to SharePoint site: {client.site_info}") diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index e634bc374..43aed4ff4 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -1,3 +1,4 @@ +"""Helper module for SharePoint data extraction using Microsoft Graph API.""" from typing import Dict, List from io import BytesIO import re @@ -10,8 +11,21 @@ class SharepointClient: - # * playground: https://developer.microsoft.com/en-us/graph/graph-explorer - # * If the result contains more results, Microsoft Graph returns an @odata.nextLink property + """Client for interacting with SharePoint via Microsoft Graph API. + + This client handles authentication and provides methods to retrieve lists, + list items, and files from SharePoint sites. 
+ + Attributes: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + graph_api_url: Base URL for Microsoft Graph API + graph_site_url: Full URL for the specific SharePoint site + client: REST client instance (set after connect()) + """ def __init__( self, @@ -21,6 +35,18 @@ def __init__( client_secret: str, sub_site_id: str = "", ) -> None: + """Initialize SharePoint client with credentials. + + Args: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + + Raises: + ValueError: If any required credentials are missing + """ self.client_id = client_id self.tenant_id = tenant_id self.client_secret = client_secret @@ -37,6 +63,14 @@ def __init__( self.graph_site_url += f"/sites/{self.sub_site_id}" def connect(self) -> None: + """Establish connection to SharePoint using MSAL authentication. + + Acquires an access token using client credentials flow and initializes + the REST client with bearer token authentication. + + Raises: + ConnectionError: If authentication fails or access token cannot be obtained + """ authority = f"https://login.microsoftonline.com/{self.tenant_id}" scope = ["https://graph.microsoft.com/.default"] @@ -62,6 +96,11 @@ def connect(self) -> None: @property def sub_sites(self) -> List: + """Get list of sub-sites within the current SharePoint site. + + Returns: + List of sub-site information dictionaries + """ url = f"{self.graph_site_url}/sites" response = self.client.get(url) site_info = response.json() @@ -72,6 +111,11 @@ def sub_sites(self) -> List: @property def site_info(self) -> Dict: + """Get information about the current SharePoint site. + + Returns: + Dictionary containing site metadata and properties + """ url = f"{self.graph_site_url}" response = self.client.get(url) site_info = response.json() @@ -81,6 +125,14 @@ def site_info(self) -> Dict: logger.warning(f"No site_info found in {url}") def get_all_lists_in_site(self) -> List[Dict]: + """Retrieve all generic lists from the SharePoint site. + + Filters for lists with template type 'genericList' and 'Lists' in their URL, + excluding document libraries and other non-list items. + + Returns: + List of dictionaries containing list metadata + """ url = f"{self.graph_site_url}/lists" res = self.client.get(url) res.raise_for_status() @@ -101,6 +153,20 @@ def get_all_lists_in_site(self) -> List[Dict]: return filtered_lists def get_items_from_list(self, list_title: str, select: str = None) -> List[Dict]: + """Retrieve items from a specific SharePoint list. + + Note: Pagination is not yet implemented; only the first page is returned. + + Args: + list_title: Display name of the SharePoint list + select: Optional comma-separated string of field names to retrieve + + Returns: + List of dictionaries containing list item field values + + Raises: + ValueError: If the specified list is not found in the site + """ # TODO, pagination not yet implemented logger.warning( "Pagination is not implemented for get_items_from_list, " @@ -148,6 +214,16 @@ def get_items_from_list(self, list_title: str, select: str = None) -> List[Dict] def get_files_from_path( self, folder_path: str, file_name_startswith: str, pattern: str = None ) -> Dict: + """Get files from a SharePoint folder matching specified criteria. 
+ + Args: + folder_path: Path to the folder within SharePoint (e.g., 'Documents/Reports') + file_name_startswith: Prefix that file names must start with + pattern: Optional regex pattern for additional filtering + + Returns: + List of file item dictionaries containing metadata and download URLs + """ folder_url = ( f"{self.graph_site_url}/drive/root:/{folder_path}:/children?$filter=startswith(name," f" '{file_name_startswith}')" @@ -164,6 +240,17 @@ def get_files_from_path( return file_items def get_file_bytes_io(self, file_item: Dict): + """Download a SharePoint file to a BytesIO object. + + Args: + file_item: File metadata dictionary containing '@microsoft.graph.downloadUrl' + + Returns: + BytesIO object containing the file contents + + Raises: + FileNotFoundError: If the file cannot be downloaded + """ file_url = file_item["@microsoft.graph.downloadUrl"] response = self.client.get(file_url) if response.status_code == 200: diff --git a/sources/sharepoint/requirements.txt b/sources/sharepoint/requirements.txt new file mode 100644 index 000000000..e7fbee22e --- /dev/null +++ b/sources/sharepoint/requirements.txt @@ -0,0 +1,3 @@ +msal>=1.20.0 +pandas>=2.0.0 +dlt>=0.5.1 diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index e1bc5a821..45ef8934c 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -1,3 +1,8 @@ +"""Configuration classes for SharePoint data extraction. + +Provides configuration models for SharePoint lists and files, +including file type definitions and validation utilities. +""" from typing import Optional, Dict import re from enum import Enum @@ -7,6 +12,19 @@ class FileType(Enum): + """Supported file types for SharePoint file extraction. + + Each file type maps to a corresponding pandas read function. + + Attributes: + EXCEL: Excel files (.xlsx, .xls) + CSV: Comma-separated values files + JSON: JSON format files + PARQUET: Apache Parquet files + SAS: SAS data files + SPSS: SPSS data files + SAV: SPSS SAV format files + """ EXCEL = "excel" CSV = "csv" JSON = "json" @@ -16,6 +34,11 @@ class FileType(Enum): SAV = "sav" def get_pd_function(self): + """Get the pandas read function for this file type. + + Returns: + Callable pandas read function (e.g., pd.read_csv, pd.read_excel) + """ return { self.EXCEL: pd.read_excel, self.CSV: pd.read_csv, @@ -27,6 +50,17 @@ def get_pd_function(self): class SharepointListConfig(BaseModel): + """Configuration for SharePoint list data extraction. + + Attributes: + table_name: Name of the destination table for the list data + list_title: Display name of the SharePoint list to extract + select: Optional comma-separated field names to retrieve + is_incremental: Enable incremental loading (not yet implemented) + + Raises: + NotImplementedError: If is_incremental is set to True + """ table_name: str list_title: str select: Optional[str] = None @@ -41,6 +75,21 @@ def __init__(self, **data): class SharepointFilesConfig(BaseModel): + """Configuration for SharePoint file extraction and processing. + + Attributes: + file_type: Type of files to process (CSV, Excel, etc.) 
+ folder_path: Path to the SharePoint folder containing files + table_name: Name of the destination table for file data + file_name_startswith: Prefix filter for file names + pattern: Optional regex pattern for additional file filtering + pandas_kwargs: Additional arguments to pass to pandas read function + is_file_incremental: Enable incremental loading based on file modification time + + Note: + The pattern attribute is automatically prefixed with file_name_startswith. + Folder paths are validated and normalized during initialization. + """ file_type: FileType folder_path: str table_name: str @@ -56,6 +105,20 @@ def __init__(self, **data): def validate_folder_path(folder_path: str) -> str: + """Validate and normalize a SharePoint folder path. + + Removes leading/trailing slashes and validates that the path contains + only allowed characters (alphanumeric, dashes, underscores, spaces, dots). + + Args: + folder_path: The folder path to validate + + Returns: + Normalized folder path without leading/trailing slashes + + Raises: + ValueError: If path contains invalid characters or double slashes + """ if folder_path.startswith("/"): folder_path = folder_path[1:] if folder_path.endswith("/"): diff --git a/tests/sharepoint/__init__.py b/tests/sharepoint/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/sharepoint/test_sharepoint_source.py b/tests/sharepoint/test_sharepoint_source.py new file mode 100644 index 000000000..b40782a08 --- /dev/null +++ b/tests/sharepoint/test_sharepoint_source.py @@ -0,0 +1,651 @@ +import pytest +from unittest.mock import Mock, patch, MagicMock +from io import BytesIO +import pandas as pd +from typing import Dict, List + +import dlt +from sources.sharepoint import ( + sharepoint_list, + sharepoint_files, + SharepointCredentials, +) +from sources.sharepoint.sharepoint_files_config import ( + SharepointFilesConfig, + SharepointListConfig, + FileType, + validate_folder_path, +) +from sources.sharepoint.helpers import SharepointClient + +from tests.utils import ( + ALL_DESTINATIONS, + assert_load_info, + load_table_counts, +) + + +# Mock credentials for testing +MOCK_CREDENTIALS = { + "client_id": "test_client_id", + "tenant_id": "test_tenant_id", + "site_id": "test_site_id", + "client_secret": "test_client_secret", + "sub_site_id": "", +} + + +class TestSharepointFilesConfig: + """Test SharepointFilesConfig class""" + + def test_valid_config(self): + """Test creating a valid SharepointFilesConfig""" + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="/Documents/Reports", + table_name="test_table", + file_name_startswith="report", + pattern=r".*\.csv$", + pandas_kwargs={"sep": ","}, + is_file_incremental=True, + ) + assert config.file_type == FileType.CSV + assert config.folder_path == "Documents/Reports" + assert config.table_name == "test_table" + assert config.file_name_startswith == "report" + assert config.pattern == r"^report.*\.csv$" + assert config.pandas_kwargs == {"sep": ","} + assert config.is_file_incremental is True + + def test_folder_path_normalization(self): + """Test that folder paths are normalized correctly""" + config = SharepointFilesConfig( + file_type=FileType.EXCEL, + folder_path="/Documents/", + table_name="test_table", + file_name_startswith="file", + ) + assert config.folder_path == "Documents" + + def test_pattern_prefix(self): + """Test that pattern is prefixed with file_name_startswith""" + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + 
table_name="test_table", + file_name_startswith="report_", + pattern=r"\d{8}\.csv$", + ) + assert config.pattern == r"^report_\d{8}\.csv$" + + def test_get_pd_function(self): + """Test that get_pd_function returns correct pandas functions""" + assert FileType.CSV.get_pd_function() == pd.read_csv + assert FileType.EXCEL.get_pd_function() == pd.read_excel + assert FileType.JSON.get_pd_function() == pd.read_json + assert FileType.PARQUET.get_pd_function() == pd.read_parquet + + +class TestSharepointListConfig: + """Test SharepointListConfig class""" + + def test_valid_config(self): + """Test creating a valid SharepointListConfig""" + config = SharepointListConfig( + table_name="test_table", + list_title="Test List", + select="field1,field2", + is_incremental=False, + ) + assert config.table_name == "test_table" + assert config.list_title == "Test List" + assert config.select == "field1,field2" + assert config.is_incremental is False + + def test_incremental_not_implemented(self): + """Test that incremental loading raises NotImplementedError""" + with pytest.raises(NotImplementedError): + SharepointListConfig( + table_name="test_table", + list_title="Test List", + is_incremental=True, + ) + + +class TestValidateFolderPath: + """Test validate_folder_path function""" + + def test_remove_leading_slash(self): + """Test that leading slashes are removed""" + assert validate_folder_path("/Documents") == "Documents" + + def test_remove_trailing_slash(self): + """Test that trailing slashes are removed""" + assert validate_folder_path("Documents/") == "Documents" + + def test_remove_both_slashes(self): + """Test that both leading and trailing slashes are removed""" + assert validate_folder_path("/Documents/") == "Documents" + + def test_valid_path_with_subdirs(self): + """Test valid path with subdirectories""" + assert validate_folder_path("Documents/Reports/2024") == "Documents/Reports/2024" + + def test_valid_path_with_spaces(self): + """Test valid path with spaces""" + assert validate_folder_path("My Documents/My Reports") == "My Documents/My Reports" + + def test_invalid_characters(self): + """Test that invalid characters raise ValueError""" + with pytest.raises(ValueError, match="Invalid folder path"): + validate_folder_path("Documents/Reports@2024") + + def test_double_slashes(self): + """Test that double slashes raise ValueError""" + with pytest.raises(ValueError, match="Invalid folder path with double slashes"): + validate_folder_path("Documents//Reports") + + +class TestSharepointClient: + """Test SharepointClient class""" + + def test_client_initialization(self): + """Test SharepointClient initialization""" + client = SharepointClient(**MOCK_CREDENTIALS) + assert client.client_id == MOCK_CREDENTIALS["client_id"] + assert client.tenant_id == MOCK_CREDENTIALS["tenant_id"] + assert client.site_id == MOCK_CREDENTIALS["site_id"] + assert client.client_secret == MOCK_CREDENTIALS["client_secret"] + assert client.sub_site_id == "" + assert client.graph_site_url == f"https://graph.microsoft.com/v1.0/sites/{MOCK_CREDENTIALS['site_id']}" + + def test_client_with_subsite(self): + """Test SharepointClient initialization with sub_site_id""" + credentials = MOCK_CREDENTIALS.copy() + credentials["sub_site_id"] = "sub_site_123" + client = SharepointClient(**credentials) + expected_url = f"https://graph.microsoft.com/v1.0/sites/{MOCK_CREDENTIALS['site_id']}/sites/sub_site_123" + assert client.graph_site_url == expected_url + + def test_client_missing_credentials(self): + """Test that missing credentials raise 
ValueError""" + with pytest.raises(ValueError, match="client_id, tenant_id, client_secret and site_id are required"): + SharepointClient( + client_id="", + tenant_id="test", + site_id="test", + client_secret="test", + ) + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_connect_success(self, mock_rest_client, mock_msal_app): + """Test successful connection to SharePoint""" + # Mock MSAL token response + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = { + "access_token": "test_access_token" + } + mock_msal_app.return_value = mock_app_instance + + # Mock REST client + mock_client_instance = Mock() + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + + assert client.client is not None + mock_msal_app.assert_called_once() + mock_rest_client.assert_called_once() + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + def test_connect_failure(self, mock_msal_app): + """Test failed connection to SharePoint""" + # Mock MSAL token response without access_token + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"error": "authentication_failed"} + mock_msal_app.return_value = mock_app_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + with pytest.raises(ConnectionError, match="Connection failed"): + client.connect() + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_all_lists_in_site(self, mock_rest_client, mock_msal_app): + """Test getting all lists from a site""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = Mock() + mock_response = Mock() + mock_response.json.return_value = { + "value": [ + { + "id": "list1", + "displayName": "Test List 1", + "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList1", + "list": {"template": "genericList"}, + }, + { + "id": "list2", + "displayName": "Test List 2", + "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList2", + "list": {"template": "genericList"}, + }, + { + "id": "list3", + "displayName": "Document Library", + "webUrl": "https://test.sharepoint.com/sites/test/Shared Documents", + "list": {"template": "documentLibrary"}, + }, + ] + } + mock_response.raise_for_status = Mock() + mock_client_instance.get.return_value = mock_response + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + lists = client.get_all_lists_in_site() + + assert len(lists) == 2 + assert all(item["list"]["template"] == "genericList" for item in lists) + assert all("Lists" in item["webUrl"] for item in lists) + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_items_from_list(self, mock_rest_client, mock_msal_app): + """Test getting items from a specific list""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = Mock() + + # Mock response for get_all_lists_in_site + mock_lists_response = Mock() + mock_lists_response.json.return_value = { + "value": [ + { 
+ "id": "list1", + "displayName": "Test List", + "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList", + "list": {"template": "genericList"}, + } + ] + } + mock_lists_response.raise_for_status = Mock() + + # Mock response for list items + mock_items_response = Mock() + mock_items_response.json.return_value = { + "value": [ + {"fields": {"Title": "Item 1", "Description": "Test item 1"}}, + {"fields": {"Title": "Item 2", "Description": "Test item 2"}}, + ] + } + mock_items_response.raise_for_status = Mock() + + mock_client_instance.get.side_effect = [mock_lists_response, mock_items_response] + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + items = client.get_items_from_list("Test List") + + assert len(items) == 2 + assert items[0]["Title"] == "Item 1" + assert items[1]["Title"] == "Item 2" + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_items_from_nonexistent_list(self, mock_rest_client, mock_msal_app): + """Test getting items from a list that doesn't exist""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = Mock() + mock_response = Mock() + mock_response.json.return_value = { + "value": [ + { + "id": "list1", + "displayName": "Test List", + "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList", + "list": {"template": "genericList"}, + } + ] + } + mock_response.raise_for_status = Mock() + mock_client_instance.get.return_value = mock_response + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + + with pytest.raises(ValueError, match="List with title 'Nonexistent List' not found"): + client.get_items_from_list("Nonexistent List") + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_files_from_path(self, mock_rest_client, mock_msal_app): + """Test getting files from a folder path""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = Mock() + mock_response = Mock() + mock_response.json.return_value = { + "value": [ + { + "name": "report_2024.csv", + "file": {}, + "lastModifiedDateTime": "2024-01-01T00:00:00Z", + }, + { + "name": "report_2023.csv", + "file": {}, + "lastModifiedDateTime": "2023-01-01T00:00:00Z", + }, + { + "name": "subfolder", + "folder": {}, + }, + ] + } + mock_client_instance.get.return_value = mock_response + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + files = client.get_files_from_path("Documents", "report", pattern=r".*2024.*") + + assert len(files) == 1 + assert files[0]["name"] == "report_2024.csv" + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_file_bytes_io(self, mock_rest_client, mock_msal_app): + """Test downloading a file to BytesIO""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = 
Mock() + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b"test file content" + mock_client_instance.get.return_value = mock_response + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + + file_item = { + "name": "test.csv", + "@microsoft.graph.downloadUrl": "https://test.sharepoint.com/download/test.csv", + } + bytes_io = client.get_file_bytes_io(file_item) + + assert isinstance(bytes_io, BytesIO) + assert bytes_io.getvalue() == b"test file content" + + +class TestSharepointListSource: + """Test sharepoint_list source""" + + @patch("sources.sharepoint.SharepointClient") + def test_sharepoint_list_source(self, mock_client_class): + """Test sharepoint_list source yields data correctly""" + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site"} + mock_client_instance.get_items_from_list.return_value = [ + {"Title": "Item 1", "Field1": "Value1"}, + {"Title": "Item 2", "Field1": "Value2"}, + ] + mock_client_class.return_value = mock_client_instance + + # Create config + config = SharepointListConfig( + table_name="test_table", + list_title="Test List", + select="Title,Field1", + ) + + # Create source + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_list(config, credentials=credentials) + + # Extract data + resources = list(source) + assert len(resources) == 1 + + # Get data from resource + resource_data = list(resources[0]) + assert len(resource_data) == 2 + assert resource_data[0]["Title"] == "Item 1" + assert resource_data[1]["Title"] == "Item 2" + + +class TestSharepointFilesSource: + """Test sharepoint_files source""" + + @patch("sources.sharepoint.SharepointClient") + def test_sharepoint_files_source_csv(self, mock_client_class): + """Test sharepoint_files source with CSV files""" + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site"} + mock_client_instance.get_files_from_path.return_value = [ + { + "name": "report.csv", + "lastModifiedDateTime": "2024-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/report.csv", + } + ] + + # Create test CSV data + csv_data = b"col1,col2\nval1,val2\nval3,val4" + mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data) + mock_client_class.return_value = mock_client_instance + + # Create config + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="test_table", + file_name_startswith="report", + ) + + # Create source + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_files(config, credentials=credentials) + + # Extract data + resources = list(source) + assert len(resources) == 1 + + # Get data from resource - this should yield file items first, then dataframes + all_data = list(resources[0]) + + # The transformer should yield dataframes + assert len(all_data) > 0 + + @patch("sources.sharepoint.SharepointClient") + def test_sharepoint_files_source_incremental(self, mock_client_class): + """Test sharepoint_files source with incremental loading""" + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site"} + mock_client_instance.get_files_from_path.return_value = [ + { + "name": "old_file.csv", + "lastModifiedDateTime": "2020-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/old_file.csv", + }, + { + "name": "new_file.csv", + 
"lastModifiedDateTime": "2024-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/new_file.csv", + }, + ] + + csv_data = b"col1,col2\nval1,val2" + mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data) + mock_client_class.return_value = mock_client_instance + + # Create config with incremental loading + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="test_table", + file_name_startswith="file", + is_file_incremental=True, + ) + + # Create source + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_files(config, credentials=credentials) + + # Extract data + resources = list(source) + assert len(resources) == 1 + + @patch("sources.sharepoint.SharepointClient") + def test_sharepoint_files_source_with_chunks(self, mock_client_class): + """Test sharepoint_files source with chunked reading""" + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site"} + mock_client_instance.get_files_from_path.return_value = [ + { + "name": "large_file.csv", + "lastModifiedDateTime": "2024-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/large_file.csv", + } + ] + + # Create larger CSV data + csv_data = b"col1,col2\n" + b"\n".join([b"val1,val2" for _ in range(100)]) + mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data) + mock_client_class.return_value = mock_client_instance + + # Create config with chunksize + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="test_table", + file_name_startswith="large", + pandas_kwargs={"chunksize": 10}, + ) + + # Create source + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_files(config, credentials=credentials) + + # Extract data + resources = list(source) + assert len(resources) == 1 + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +def test_sharepoint_list_pipeline(destination_name: str) -> None: + """Integration test for sharepoint_list pipeline""" + + with patch("sources.sharepoint.SharepointClient") as mock_client_class: + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site", "displayName": "Test Site"} + mock_client_instance.get_items_from_list.return_value = [ + {"Title": "Item 1", "Description": "Description 1", "Status": "Active"}, + {"Title": "Item 2", "Description": "Description 2", "Status": "Completed"}, + {"Title": "Item 3", "Description": "Description 3", "Status": "Active"}, + ] + mock_client_class.return_value = mock_client_instance + + # Create pipeline + pipeline = dlt.pipeline( + pipeline_name="test_sharepoint_list", + destination=destination_name, + dataset_name="sharepoint_list_test", + dev_mode=True, + ) + + # Create config + config = SharepointListConfig( + table_name="test_items", + list_title="Test List", + ) + + # Create source and run pipeline + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_list(config, credentials=credentials) + load_info = pipeline.run(source) + + # Assert load info + assert_load_info(load_info) + + # Check table counts + table_counts = load_table_counts(pipeline, "test_items") + assert table_counts["test_items"] == 3 + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +def test_sharepoint_files_pipeline(destination_name: str) -> None: + """Integration test for sharepoint_files pipeline""" + + with 
patch("sources.sharepoint.SharepointClient") as mock_client_class: + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site", "displayName": "Test Site"} + mock_client_instance.get_files_from_path.return_value = [ + { + "name": "data.csv", + "lastModifiedDateTime": "2024-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/data.csv", + } + ] + + # Create test CSV data + csv_data = b"name,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,Chicago" + mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data) + mock_client_class.return_value = mock_client_instance + + # Create pipeline + pipeline = dlt.pipeline( + pipeline_name="test_sharepoint_files", + destination=destination_name, + dataset_name="sharepoint_files_test", + dev_mode=True, + ) + + # Create config + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="test_data", + file_name_startswith="data", + ) + + # Create source and run pipeline + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_files(config, credentials=credentials) + load_info = pipeline.run(source) + + # Assert load info + assert_load_info(load_info) + + # Check table counts + table_counts = load_table_counts(pipeline, "test_data") + assert table_counts["test_data"] == 3 From 0a76475fe164f0123bb0ba07e76059f6bc60ad97 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Mon, 8 Dec 2025 15:28:30 +0100 Subject: [PATCH 9/9] refactor: Update resource extraction in SharepointListSource and SharepointFilesSource tests for clarity --- tests/sharepoint/test_sharepoint_source.py | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/sharepoint/test_sharepoint_source.py b/tests/sharepoint/test_sharepoint_source.py index b40782a08..492bdd456 100644 --- a/tests/sharepoint/test_sharepoint_source.py +++ b/tests/sharepoint/test_sharepoint_source.py @@ -427,8 +427,8 @@ def test_sharepoint_list_source(self, mock_client_class): credentials = SharepointCredentials(**MOCK_CREDENTIALS) source = sharepoint_list(config, credentials=credentials) - # Extract data - resources = list(source) + # Extract resources from source + resources = list(source.resources.values()) assert len(resources) == 1 # Get data from resource @@ -436,8 +436,6 @@ def test_sharepoint_list_source(self, mock_client_class): assert len(resource_data) == 2 assert resource_data[0]["Title"] == "Item 1" assert resource_data[1]["Title"] == "Item 2" - - class TestSharepointFilesSource: """Test sharepoint_files source""" @@ -472,11 +470,11 @@ def test_sharepoint_files_source_csv(self, mock_client_class): credentials = SharepointCredentials(**MOCK_CREDENTIALS) source = sharepoint_files(config, credentials=credentials) - # Extract data - resources = list(source) + # Extract resources from source + resources = list(source.resources.values()) assert len(resources) == 1 - # Get data from resource - this should yield file items first, then dataframes + # Get data from resource - this should yield dataframes all_data = list(resources[0]) # The transformer should yield dataframes @@ -518,8 +516,8 @@ def test_sharepoint_files_source_incremental(self, mock_client_class): credentials = SharepointCredentials(**MOCK_CREDENTIALS) source = sharepoint_files(config, credentials=credentials) - # Extract data - resources = list(source) + # Extract resources from source + resources = list(source.resources.values()) assert len(resources) == 1 
@patch("sources.sharepoint.SharepointClient") @@ -554,10 +552,15 @@ def test_sharepoint_files_source_with_chunks(self, mock_client_class): credentials = SharepointCredentials(**MOCK_CREDENTIALS) source = sharepoint_files(config, credentials=credentials) - # Extract data - resources = list(source) + # Extract resources from source + resources = list(source.resources.values()) assert len(resources) == 1 + # Get data from resource - with chunksize, this yields multiple dataframes (chunks) + all_chunks = list(resources[0]) + # Should have 10 chunks (100 rows / 10 rows per chunk) + assert len(all_chunks) == 10 + @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_sharepoint_list_pipeline(destination_name: str) -> None: