From 89307f0bc98c2e9cfadc0d0b4be697b3093c9906 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:33:57 +0200 Subject: [PATCH 1/9] feat: Implement SharePoint data extraction sources and configuration classes --- sources/sharepoint/__init__.py | 106 ++++++++ sources/sharepoint/helpers.py | 245 ++++++++++++++++++ sources/sharepoint/sharepoint_files_config.py | 76 ++++++ sources/sharepoint_pipeline.py | 62 +++++ 4 files changed, 489 insertions(+) create mode 100644 sources/sharepoint/__init__.py create mode 100644 sources/sharepoint/helpers.py create mode 100644 sources/sharepoint/sharepoint_files_config.py create mode 100644 sources/sharepoint_pipeline.py diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py new file mode 100644 index 000000000..4d53d983f --- /dev/null +++ b/sources/sharepoint/__init__.py @@ -0,0 +1,106 @@ +from typing import Iterator, Dict +import re + +import dlt +from dlt.common.typing import TDataItems +from dlt.common.configuration.specs import configspec, BaseConfiguration +from loguru import logger +import pandas as pd + +from .helpers import SharepointClient +from .sharepoint_files_config import SharepointFilesConfig, SharepointListConfig + + +@configspec +class SharepointCredentials(BaseConfiguration): + client_id: str = None + tenant_id: str = None + site_id: str = None + client_secret: str = None + sub_site_id: str = "" + + +@dlt.source(name="sharepoint_list", max_table_nesting=0) +def sharepoint_list( + sharepoint_list_config: SharepointListConfig, + credentials: SharepointCredentials = dlt.secrets.value, +) -> Iterator[Dict[str, str]]: + client: SharepointClient = SharepointClient(**credentials) + client.connect() + logger.info(f"Connected to SharePoint site: {client.site_info}") + + def get_pipe(sharepoint_list_config: SharepointListConfig): + def get_records(sharepoint_list_config: SharepointListConfig): + data = client.get_items_from_list(list_title=sharepoint_list_config.list_title, select=sharepoint_list_config.select) + yield from data + return dlt.resource(get_records, name=sharepoint_list_config.table_name)(sharepoint_list_config) + yield get_pipe(sharepoint_list_config=sharepoint_list_config) + + +@dlt.source(name="sharepoint_files", max_table_nesting=0) +def sharepoint_files( + sharepoint_files_config: SharepointFilesConfig, + credentials: SharepointCredentials = dlt.secrets.value, +): + client: SharepointClient = SharepointClient(**credentials) + client.connect() + logger.info(f"Connected to SharePoint site: {client.site_info}") + + def get_files( + sharepoint_files_config: SharepointFilesConfig, + last_update_timestamp: dlt.sources.incremental = dlt.sources.incremental( + cursor_path="lastModifiedDateTime", + initial_value="2020-01-01T00:00:00Z", + primary_key=(), + ), + ): + current_last_value = last_update_timestamp.last_value + logger.debug(f"current_last_value: {current_last_value}") + for file_item in client.get_files_from_path( + folder_path=sharepoint_files_config.folder_path, + file_name_startswith=sharepoint_files_config.file_name_startswith, + pattern=sharepoint_files_config.pattern, + ): + if file_item["size"] > sharepoint_files_config.file_size_limit: + logger.warning(f"File {file_item['name']} is too large, skipping") + raise RuntimeError( + f"File {file_item['name']} is larger than the limit of" + f" {sharepoint_files_config.file_size_limit} bytes." 
+            )
+            logger.debug(
+                "filtering files based on lastModifiedDateTime, compare to last_value:"
+                f" {current_last_value}"
+            )
+            if file_item["lastModifiedDateTime"] > current_last_value or not sharepoint_files_config.is_file_incremental:
+                logger.info(
+                    f"Processing file after lastModifiedDateTime filter: {file_item['name']}"
+                )
+
+                file_item["pd_function"] = sharepoint_files_config.file_type.get_pd_function()
+                file_item["pd_kwargs"] = sharepoint_files_config.pandas_kwargs
+                yield file_item
+            else:
+                logger.info(
+                    f"Skipping file {file_item['name']} based on lastModifiedDateTime filter"
+                )
+
+    def get_records(file_item: Dict) -> TDataItems:
+        chunksize = file_item["pd_kwargs"].get("chunksize", None)
+        file_io = client.get_file_bytes_io(file_item=file_item)
+
+        if chunksize:
+            with file_item["pd_function"](file_io, **file_item["pd_kwargs"]) as reader:
+                for num, chunk in enumerate(reader):
+                    logger.info(f"Processing chunk {num} of {file_item['name']}")
+                    yield chunk
+        else:
+            df = file_item["pd_function"](file_io, **file_item["pd_kwargs"])
+            yield df
+        logger.debug(f"get_records done for {file_item['name']}")
+
+    def get_pipe(sharepoint_files_config: SharepointFilesConfig):
+        return dlt.resource(get_files, name=f"{sharepoint_files_config.table_name}_files")(sharepoint_files_config) | dlt.transformer(
+            get_records, name=sharepoint_files_config.table_name, parallelized=False
+        )
+
+    yield get_pipe(sharepoint_files_config=sharepoint_files_config)
diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py
new file mode 100644
index 000000000..7e534ba69
--- /dev/null
+++ b/sources/sharepoint/helpers.py
@@ -0,0 +1,245 @@
+from typing import Dict, Union, List, Tuple
+from io import BytesIO
+import re
+
+from msal import ConfidentialClientApplication
+from loguru import logger
+from dlt.sources.helpers.rest_client import RESTClient
+from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
+from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator
+
+
+class SharepointClient:
+    # * playground: https://developer.microsoft.com/en-us/graph/graph-explorer
+    # * If the result contains more results, Microsoft Graph returns an @odata.nextLink property
+
+    def __init__(
+        self,
+        client_id: str,
+        tenant_id: str,
+        site_id: str,
+        client_secret: str,
+        sub_site_id: str = "",
+    ) -> None:
+        self.client_id = client_id
+        self.tenant_id = tenant_id
+        self.client_secret = client_secret
+        self.sub_site_id = sub_site_id
+        self.site_id = site_id
+        if not all([self.client_id, self.tenant_id, self.client_secret, self.site_id]):
+            raise ValueError(
+                "client_id, tenant_id, client_secret and site_id are required to connect to"
+                " SharePoint"
+            )
+        self.graph_api_url = "https://graph.microsoft.com/v1.0/sites"
+        self.graph_site_url = f"{self.graph_api_url}/{self.site_id}"
+        if self.sub_site_id:
+            self.graph_site_url += f"/sites/{self.sub_site_id}"
+
+    def connect(self) -> None:
+        authority = f"https://login.microsoftonline.com/{self.tenant_id}"
+        scope = ["https://graph.microsoft.com/.default"]
+
+        app = ConfidentialClientApplication(
+            self.client_id,
+            authority=authority,
+            client_credential=self.client_secret,
+        )
+
+        # Get the access token
+        token_response = app.acquire_token_for_client(scopes=scope)
+        access_token = token_response.get("access_token", None)
+
+        if access_token:
+            self.client = RESTClient(
+                base_url=self.graph_site_url,
+                auth=BearerTokenAuth(access_token),
+                paginator=JSONLinkPaginator(next_url_path="@odata.nextLink"),
+            )
+            logger.success(f"Connected
to SharePoint site id: {self.site_id} successfully") + else: + raise ConnectionError("Connection failed : ", token_response) + + @property + def sub_sites(self) -> List: + url = f"{self.graph_site_url}/sites" + response = self.client.get(url) + site_info = response.json() + if "value" in site_info: + return site_info["value"] + else: + logger.warning(f"No subsite found in {url}") + + @property + def site_info(self) -> Dict: + url = f"{self.graph_site_url}" + response = self.client.get(url) + site_info = response.json() + if not "error" in site_info: + return site_info + else: + logger.warning(f"No site_info found in {url}") + + def get_all_lists_in_site(self) -> List[Dict]: + url = f"{self.graph_site_url}/lists" + res = self.client.get(url) + res.raise_for_status() + lists_info = res.json() + if "value" in lists_info: + all_items = lists_info["value"] + filtered_lists = [ + item for item in all_items + if item.get("list", {}).get("template") == "genericList" + and "Lists" in item.get("webUrl", "") + ] + return filtered_lists + else: + filtered_lists = [] + if not filtered_lists: + logger.warning(f"No lists found in {url}") + return filtered_lists + + def get_items_from_list(self, list_title: str, select:str = None) -> List[Dict]: + #TODO, pagination not yet implemented + logger.warning( + "Pagination is not implemented for get_items_from_list, " + "it will return only first page of items." + ) + all_lists = self.get_all_lists_in_site() + filtered_lists = [ + x for x in all_lists + if x.get("list", {}).get("template") == "genericList" + and "Lists" in x.get("webUrl", "") + ] + + possible_list_titles = [x["displayName"] for x in filtered_lists] + if list_title not in possible_list_titles: + raise ValueError( + f"List with title '{list_title}' not found in site {self.site_id}. 
" + f"Available lists: {possible_list_titles}" + ) + + # Get the list ID + list_id = next( + x["id"] for x in filtered_lists if x["displayName"] == list_title + ) + + url = f"{self.graph_site_url}/lists/{list_id}/items?expand=fields" + if select: + url += f"(select={select})" + res = self.client.get(url) + res.raise_for_status() + items_info = res.json() + + if "value" in items_info: + output = [x.get("fields", {}) for x in items_info["value"]] + else: + output = [] + if output: + logger.info(f"Got {len(output)} items from list: {list_title}") + return output + else: + logger.warning(f"No items found in list: {list_title}, with select: {select}") + + def get_files_from_path( + self, folder_path: str, file_name_startswith: str, pattern: str = None + ) -> Dict: + folder_url = ( + f"{self.graph_site_url}/drive/root:/{folder_path}:/children?$filter=startswith(name," + f" '{file_name_startswith}')" + ) + logger.debug(f"Getting files from folder with endpoint: {folder_url}") + res = self.client.get(folder_url) + file_and_folder_items = res.json().get("value", []) + file_items = [x for x in file_and_folder_items if "file" in x.keys()] + if pattern: + logger.debug(f"Filtering files with pattern: {pattern}") + file_items = [x for x in file_items if re.search(pattern, x["name"])] + + logger.debug(f"Got number files from ms graph api: {len(file_items)}") + return file_items + + def get_file_bytes_io(self, file_item: Dict): + file_url = file_item["@microsoft.graph.downloadUrl"] + response = self.client.get(file_url) + if response.status_code == 200: + bytes_io = BytesIO(response.content) + logger.info( + f"File {file_item['name']} downloaded to BytesIO, size: {len(bytes_io.getvalue())}" + ) + return bytes_io + else: + raise FileNotFoundError(f"File not found: {file_item['name']} or can't be downloaded") + + def archive_file(self, file_item: Dict, archive_folder_path: str, new_file_name: str) -> None: + url = f"{self.graph_site_url}/drive/items/{file_item['id']}" + archive_folder_path = self.remove_driver_root_in_path(archive_folder_path) + archive_folder_id = self.create_folder_if_not_exists(folder_path=archive_folder_path) + body = { + "parentReference": {"id": archive_folder_id}, + "name": new_file_name, + } + res = self.client.patch(url, json=body) + if res.status_code == 200: + logger.success( + f"File {file_item['name']} renamed to {new_file_name} in {archive_folder_path}" + ) + else: + raise RuntimeError(f"File {file_item['name']} can't be renamed to {new_file_name}") + + def safe_get_folder_id(self, folder_path: str) -> Union[str, None]: + folder_url = f"{self.graph_site_url}/drive/root:/{folder_path}" + res = self.client.get(folder_url) + if res.status_code == 200: + return res.json()["id"] + + def list_folder(self, folder_path: str) -> Tuple[List, List]: + """List sub folders and files in folder_path + + Args: + folder_path (str): folder_path from sharepoint + + Returns: + Tuple[List, List]: (List of folders, List of files) + """ + if r"/" not in folder_path: + raise ValueError(f"Invalid folder path: {folder_path}, must contain '/'") + folder_url = f"{self.graph_site_url}/drive/root:/{folder_path}:/children" + logger.info(f"Listing from folder_path: {folder_path} using {folder_url}") + res = self.client.get(folder_url) + file_and_folder_items = res.json().get("value", []) + file_items = [x for x in file_and_folder_items if "file" in x.keys()] + folder_items = [x for x in file_and_folder_items if "folder" in x.keys()] + return (folder_items, file_items) + + def create_folder(self, folder_path: 
str) -> str: + if r"/" not in folder_path: + raise ValueError(f"Invalid folder path: {folder_path}, must contain '/'") + parent_folder, folder_name = folder_path.rsplit("/", 1) + parent_folder_id = self.safe_get_folder_id(parent_folder) + if not parent_folder_id: + raise ValueError(f"Parent folder {parent_folder} not found") + logger.debug(f"Creating folder {folder_name} in {parent_folder}") + folder_url = f"{self.graph_site_url}/drive/items/{parent_folder_id}/children" + body = { + "name": folder_name, + "folder": {}, + "@microsoft.graph.conflictBehavior": "fail", + } + res = self.client.post(folder_url, json=body) + if res.status_code == 201: + logger.success(f"Folder {folder_name} created") + return res.json()["id"] + else: + raise RuntimeError(f"Folder {folder_name} can't be created") + + def create_folder_if_not_exists(self, folder_path: str) -> str: + folder_id = self.safe_get_folder_id(folder_path) + if folder_id: + logger.info(f"Folder {folder_path} already exists") + return folder_id + else: + return self.create_folder(folder_path) + + def remove_driver_root_in_path(self, path: str) -> str: + return re.sub(r"^/drive/root:/", "", path) diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py new file mode 100644 index 000000000..fd9c5ebd6 --- /dev/null +++ b/sources/sharepoint/sharepoint_files_config.py @@ -0,0 +1,76 @@ +from typing import Iterator, Optional, Sequence, List, Dict +import re +from enum import Enum + +from loguru import logger +import pandas as pd +from pydantic import BaseModel + + + +class FileType(Enum): + EXCEL = "excel" + CSV = "csv" + JSON = "json" + PARQUET = "parquet" + SAS = "sas" + SPSS = "spss" + SAV = "sav" + + def get_pd_function(self): + return { + self.EXCEL: pd.read_excel, + self.CSV: pd.read_csv, + self.JSON: pd.read_json, + self.PARQUET: pd.read_parquet, + self.SAS: pd.read_sas, + self.SPSS: pd.read_spss, + }[self] + + +class SharepointListConfig(BaseModel): + table_name: str + list_title: str + select: Optional[str] = None + limit: Optional[int] = None + is_incremental: Optional[bool] = False + + def __init__(self, **data): + super().__init__(**data) + if self.is_incremental is True: + raise NotImplementedError( + "Incremental loading for Sharepoint List is not implemented yet." 
+            )
+
+class SharepointFilesConfig(BaseModel):
+    file_type: FileType
+    folder_path: str
+    table_name: str
+    file_name_startswith: str
+    pattern: Optional[str] = ".*"
+    pandas_kwargs: Dict = {}
+    limit: Optional[int] = None
+    file_size_limit: Optional[int] = 100_000_000  # 100 MB
+    is_compressed_folder: Optional[bool] = False
+    if_apply_str_to_all_columns: Optional[bool] = True
+    is_file_incremental: bool = False
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self.folder_path = validate_folder_path(self.folder_path)
+        self.pattern = f"^{self.file_name_startswith}{self.pattern}"
+
+
+def validate_folder_path(folder_path: str) -> str:
+    if folder_path.startswith("/"):
+        folder_path = folder_path[1:]
+    if folder_path.endswith("/"):
+        folder_path = folder_path[:-1]
+    if not re.compile(r"^[a-zA-Z0-9_\-/\s\.]*$").match(folder_path):
+        raise ValueError(
+            "Invalid folder path, only alphanumeric characters, dashes, underscores,"
+            f" spaces, dots and slashes are allowed: {folder_path}"
+        )
+    if re.compile(r"//").search(folder_path):
+        raise ValueError(f"Invalid folder path with double slashes: {folder_path}")
+    return folder_path
diff --git a/sources/sharepoint_pipeline.py b/sources/sharepoint_pipeline.py
new file mode 100644
index 000000000..57b83b1ef
--- /dev/null
+++ b/sources/sharepoint_pipeline.py
@@ -0,0 +1,62 @@
+
+import dlt
+from sharepoint import sharepoint_list, sharepoint_files, SharepointCredentials
+from sharepoint.sharepoint_files_config import SharepointFilesConfig, SharepointListConfig
+
+if __name__ == "__main__":
+    # --- 1. Define SharePoint credentials ---
+    credentials = SharepointCredentials(
+        client_id="your-client-id",
+        tenant_id="your-tenant-id",
+        site_id="your-site-id",
+        client_secret="your-client-secret",
+        sub_site_id=""
+    )
+
+    # --- 2. Configure SharePoint list extraction ---
+    list_config = SharepointListConfig(
+        list_title="test_list",
+        select="Title,ins",
+        table_name="sharepoint_list_table"
+    )
+
+    # --- 3. Configure SharePoint file extraction ---
+    files_config = SharepointFilesConfig(
+        folder_path="General/sharepoint_test",
+        file_name_startswith="test_",
+        pattern=r".*\.csv$",
+        file_type="csv",
+        table_name="sharepoint_reports",
+        is_file_incremental=True,
+        file_size_limit=5_000_000,
+        pandas_kwargs={}
+    )
+
+    # --- 4. Create the DLT pipeline (destination = DuckDB) ---
+    pipeline = dlt.pipeline(
+        pipeline_name="sharepoint_to_duckdb",
+        destination="duckdb",
+        dataset_name="sharepoint_data",
+        full_refresh=False
+    )
+
+    # --- 5.
Run both sources and load to DuckDB --- + print("Loading SharePoint List data...") + list_load_info = pipeline.run( + sharepoint_list(sharepoint_list_config=list_config, credentials=credentials) + ) + print(list_load_info) + with pipeline.sql_client() as client: + df = client.execute("SELECT * FROM sharepoint_list_table LIMIT 10").df() + print(df) + + + print("Loading SharePoint Files data...") + files_load_info = pipeline.run( + sharepoint_files(sharepoint_files_config=files_config, credentials=credentials) + ) + print(files_load_info) + + with pipeline.sql_client() as client: + df = client.execute("SELECT * FROM sharepoint_reports LIMIT 10").df() + print(df) From d9fe95cd9f167c302e01e071c1afa0db9ca7b2a1 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:36:56 +0200 Subject: [PATCH 2/9] Fix: Remove file size limit from SharePoint file configuration and extraction --- sources/sharepoint/__init__.py | 6 ------ sources/sharepoint/sharepoint_files_config.py | 1 - sources/sharepoint_pipeline.py | 1 - 3 files changed, 8 deletions(-) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py index 4d53d983f..1740e1c70 100644 --- a/sources/sharepoint/__init__.py +++ b/sources/sharepoint/__init__.py @@ -61,12 +61,6 @@ def get_files( file_name_startswith=sharepoint_files_config.file_name_startswith, pattern=sharepoint_files_config.pattern, ): - if file_item["size"] > sharepoint_files_config.file_size_limit: - logger.warning(f"File {file_item['name']} is too large, skipping") - raise RuntimeError( - f"File {file_item['name']} is larger than the limit of" - f" {sharepoint_files_config.file_size_limit} bytes." - ) logger.debug( "filtering files based on lastModifiedDateTime, compare to last_value:" f" {current_last_value}" diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index fd9c5ebd6..0e748c2c8 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -50,7 +50,6 @@ class SharepointFilesConfig(BaseModel): pattern: Optional[str] = ".*" pandas_kwargs: Dict = {} limit: Optional[int] = None - file_size_limit: Optional[int] = 100_000_000 # 100 MB is_compressed_folder: Optional[bool] = False if_apply_str_to_all_columns: Optional[bool] = True is_file_incremental: bool = False diff --git a/sources/sharepoint_pipeline.py b/sources/sharepoint_pipeline.py index 57b83b1ef..0e63653ff 100644 --- a/sources/sharepoint_pipeline.py +++ b/sources/sharepoint_pipeline.py @@ -28,7 +28,6 @@ file_type="csv", table_name="sharepoint_reports", is_file_incremental=True, - file_size_limit=5_000_000, pandas_kwargs={} ) From dc6e8f5ce24d92d326b917dcba1f119d03663139 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:39:31 +0200 Subject: [PATCH 3/9] Refactor: Replace loguru logger with dlt.common logger in SharePoint modules --- sources/sharepoint/__init__.py | 2 +- sources/sharepoint/helpers.py | 2 +- sources/sharepoint/sharepoint_files_config.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py index 1740e1c70..3d5670b50 100644 --- a/sources/sharepoint/__init__.py +++ b/sources/sharepoint/__init__.py @@ -4,7 +4,7 @@ import dlt from dlt.common.typing import TDataItems from dlt.common.configuration.specs import configspec, BaseConfiguration -from loguru import logger +from dlt.common import logger import pandas as pd from .helpers import SharepointClient diff --git 
a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index 7e534ba69..3ccf19fbe 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -3,7 +3,7 @@ import re from msal import ConfidentialClientApplication -from loguru import logger +from dlt.common import logger from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.rest_client.auth import BearerTokenAuth from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index 0e748c2c8..1daf69254 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -1,13 +1,11 @@ -from typing import Iterator, Optional, Sequence, List, Dict +from typing import Optional, Dict import re from enum import Enum -from loguru import logger import pandas as pd from pydantic import BaseModel - class FileType(Enum): EXCEL = "excel" CSV = "csv" From ad83f5709288c935c194be422e43b16bfd544b03 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:44:28 +0200 Subject: [PATCH 4/9] Refactor: Remove unused methods and imports from SharepointClient class --- sources/sharepoint/helpers.py | 75 +---------------------------------- 1 file changed, 1 insertion(+), 74 deletions(-) diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index 3ccf19fbe..e88e42f25 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -1,4 +1,4 @@ -from typing import Dict, Union, List, Tuple +from typing import Dict, List from io import BytesIO import re @@ -170,76 +170,3 @@ def get_file_bytes_io(self, file_item: Dict): return bytes_io else: raise FileNotFoundError(f"File not found: {file_item['name']} or can't be downloaded") - - def archive_file(self, file_item: Dict, archive_folder_path: str, new_file_name: str) -> None: - url = f"{self.graph_site_url}/drive/items/{file_item['id']}" - archive_folder_path = self.remove_driver_root_in_path(archive_folder_path) - archive_folder_id = self.create_folder_if_not_exists(folder_path=archive_folder_path) - body = { - "parentReference": {"id": archive_folder_id}, - "name": new_file_name, - } - res = self.client.patch(url, json=body) - if res.status_code == 200: - logger.success( - f"File {file_item['name']} renamed to {new_file_name} in {archive_folder_path}" - ) - else: - raise RuntimeError(f"File {file_item['name']} can't be renamed to {new_file_name}") - - def safe_get_folder_id(self, folder_path: str) -> Union[str, None]: - folder_url = f"{self.graph_site_url}/drive/root:/{folder_path}" - res = self.client.get(folder_url) - if res.status_code == 200: - return res.json()["id"] - - def list_folder(self, folder_path: str) -> Tuple[List, List]: - """List sub folders and files in folder_path - - Args: - folder_path (str): folder_path from sharepoint - - Returns: - Tuple[List, List]: (List of folders, List of files) - """ - if r"/" not in folder_path: - raise ValueError(f"Invalid folder path: {folder_path}, must contain '/'") - folder_url = f"{self.graph_site_url}/drive/root:/{folder_path}:/children" - logger.info(f"Listing from folder_path: {folder_path} using {folder_url}") - res = self.client.get(folder_url) - file_and_folder_items = res.json().get("value", []) - file_items = [x for x in file_and_folder_items if "file" in x.keys()] - folder_items = [x for x in file_and_folder_items if "folder" in x.keys()] - return (folder_items, file_items) - - def create_folder(self, 
folder_path: str) -> str: - if r"/" not in folder_path: - raise ValueError(f"Invalid folder path: {folder_path}, must contain '/'") - parent_folder, folder_name = folder_path.rsplit("/", 1) - parent_folder_id = self.safe_get_folder_id(parent_folder) - if not parent_folder_id: - raise ValueError(f"Parent folder {parent_folder} not found") - logger.debug(f"Creating folder {folder_name} in {parent_folder}") - folder_url = f"{self.graph_site_url}/drive/items/{parent_folder_id}/children" - body = { - "name": folder_name, - "folder": {}, - "@microsoft.graph.conflictBehavior": "fail", - } - res = self.client.post(folder_url, json=body) - if res.status_code == 201: - logger.success(f"Folder {folder_name} created") - return res.json()["id"] - else: - raise RuntimeError(f"Folder {folder_name} can't be created") - - def create_folder_if_not_exists(self, folder_path: str) -> str: - folder_id = self.safe_get_folder_id(folder_path) - if folder_id: - logger.info(f"Folder {folder_path} already exists") - return folder_id - else: - return self.create_folder(folder_path) - - def remove_driver_root_in_path(self, path: str) -> str: - return re.sub(r"^/drive/root:/", "", path) From c94cb2f888af479f27812d683f6685a0d643fe5d Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:52:33 +0200 Subject: [PATCH 5/9] fix: Change log level from success to info for SharePoint connection message --- sources/sharepoint/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index e88e42f25..852e71a19 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -56,7 +56,7 @@ def connect(self) -> None: auth=BearerTokenAuth(access_token), paginator=JSONLinkPaginator(next_url_path="@odata.nextLink"), ) - logger.success(f"Connected to SharePoint site id: {self.site_id} successfully") + logger.info(f"Connected to SharePoint site id: {self.site_id} successfully") else: raise ConnectionError("Connection failed : ", token_response) From 07249844824307bab82703e8bdef072c49009ddd Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 15:54:52 +0200 Subject: [PATCH 6/9] refactor: Improve code readability by formatting and organizing function definitions in SharePoint modules --- sources/sharepoint/__init__.py | 24 +++++++++++++++---- sources/sharepoint/helpers.py | 18 +++++++++----- sources/sharepoint/sharepoint_files_config.py | 1 + 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py index 3d5670b50..bb684add8 100644 --- a/sources/sharepoint/__init__.py +++ b/sources/sharepoint/__init__.py @@ -31,9 +31,16 @@ def sharepoint_list( def get_pipe(sharepoint_list_config: SharepointListConfig): def get_records(sharepoint_list_config: SharepointListConfig): - data = client.get_items_from_list(list_title=sharepoint_list_config.list_title, select=sharepoint_list_config.select) + data = client.get_items_from_list( + list_title=sharepoint_list_config.list_title, + select=sharepoint_list_config.select, + ) yield from data - return dlt.resource(get_records, name=sharepoint_list_config.table_name)(sharepoint_list_config) + + return dlt.resource(get_records, name=sharepoint_list_config.table_name)( + sharepoint_list_config + ) + yield get_pipe(sharepoint_list_config=sharepoint_list_config) @@ -65,12 +72,17 @@ def get_files( "filtering files based on lastModifiedDateTime, compare to last_value:" f" {current_last_value}" ) - if 
file_item["lastModifiedDateTime"] > current_last_value or not sharepoint_files_config.is_file_incremental: + if ( + file_item["lastModifiedDateTime"] > current_last_value + or not sharepoint_files_config.is_file_incremental + ): logger.info( f"Processing file after lastModifiedDateTime filter: {file_item['name']}" ) - file_item["pd_function"] = sharepoint_files_config.file_type.get_pd_function() + file_item["pd_function"] = ( + sharepoint_files_config.file_type.get_pd_function() + ) file_item["pd_kwargs"] = sharepoint_files_config.pandas_kwargs yield file_item else: @@ -93,7 +105,9 @@ def get_records(file_item: Dict) -> TDataItems: logger.debug(f"get_records done for {file_item['name']}") def get_pipe(sharepoint_files_config: SharepointFilesConfig): - return dlt.resource(get_files, name=f"{sharepoint_files_config.table_name}_files")(sharepoint_files_config) | dlt.transformer( + return dlt.resource( + get_files, name=f"{sharepoint_files_config.table_name}_files" + )(sharepoint_files_config) | dlt.transformer( get_records, name=sharepoint_files_config.table_name, parallelized=False ) diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index 852e71a19..e634bc374 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -88,7 +88,8 @@ def get_all_lists_in_site(self) -> List[Dict]: if "value" in lists_info: all_items = lists_info["value"] filtered_lists = [ - item for item in all_items + item + for item in all_items if item.get("list", {}).get("template") == "genericList" and "Lists" in item.get("webUrl", "") ] @@ -99,15 +100,16 @@ def get_all_lists_in_site(self) -> List[Dict]: logger.warning(f"No lists found in {url}") return filtered_lists - def get_items_from_list(self, list_title: str, select:str = None) -> List[Dict]: - #TODO, pagination not yet implemented + def get_items_from_list(self, list_title: str, select: str = None) -> List[Dict]: + # TODO, pagination not yet implemented logger.warning( "Pagination is not implemented for get_items_from_list, " "it will return only first page of items." ) all_lists = self.get_all_lists_in_site() filtered_lists = [ - x for x in all_lists + x + for x in all_lists if x.get("list", {}).get("template") == "genericList" and "Lists" in x.get("webUrl", "") ] @@ -139,7 +141,9 @@ def get_items_from_list(self, list_title: str, select:str = None) -> List[Dict]: logger.info(f"Got {len(output)} items from list: {list_title}") return output else: - logger.warning(f"No items found in list: {list_title}, with select: {select}") + logger.warning( + f"No items found in list: {list_title}, with select: {select}" + ) def get_files_from_path( self, folder_path: str, file_name_startswith: str, pattern: str = None @@ -169,4 +173,6 @@ def get_file_bytes_io(self, file_item: Dict): ) return bytes_io else: - raise FileNotFoundError(f"File not found: {file_item['name']} or can't be downloaded") + raise FileNotFoundError( + f"File not found: {file_item['name']} or can't be downloaded" + ) diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index 1daf69254..1a8e4d8a1 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -40,6 +40,7 @@ def __init__(self, **data): "Incremental loading for Sharepoint List is not implemented yet." 
) + class SharepointFilesConfig(BaseModel): file_type: FileType folder_path: str From 34776d1b62ab49e0d710e31432033e524bf65a78 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Thu, 23 Oct 2025 16:44:41 +0200 Subject: [PATCH 7/9] fix: Remove unused attributes from SharepointListConfig and SharepointFilesConfig classes --- sources/sharepoint/sharepoint_files_config.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index 1a8e4d8a1..e1bc5a821 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -30,7 +30,6 @@ class SharepointListConfig(BaseModel): table_name: str list_title: str select: Optional[str] = None - limit: Optional[int] = None is_incremental: Optional[bool] = False def __init__(self, **data): @@ -48,9 +47,6 @@ class SharepointFilesConfig(BaseModel): file_name_startswith: str pattern: Optional[str] = ".*" pandas_kwargs: Dict = {} - limit: Optional[int] = None - is_compressed_folder: Optional[bool] = False - if_apply_str_to_all_columns: Optional[bool] = True is_file_incremental: bool = False def __init__(self, **data): From 933becdb09bdd73b07fe339d50e64b15cbc2ce39 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Mon, 8 Dec 2025 15:19:55 +0100 Subject: [PATCH 8/9] feat: Add SharePoint source with list and file extraction capabilities - Updated `pyproject.toml` to include SharePoint dependencies. - Created `README.md` for SharePoint source documentation. - Implemented SharePoint client in `helpers.py` for API interaction. - Added configuration classes in `sharepoint_files_config.py` for lists and files. - Developed extraction functions in `__init__.py` for SharePoint lists and files. - Created unit tests for SharePoint source in `test_sharepoint_source.py`. - Added requirements file for SharePoint source dependencies. --- pyproject.toml | 5 + sources/sharepoint/README.md | 256 +++++++ sources/sharepoint/__init__.py | 58 ++ sources/sharepoint/helpers.py | 91 ++- sources/sharepoint/requirements.txt | 3 + sources/sharepoint/sharepoint_files_config.py | 63 ++ tests/sharepoint/__init__.py | 0 tests/sharepoint/test_sharepoint_source.py | 651 ++++++++++++++++++ 8 files changed, 1125 insertions(+), 2 deletions(-) create mode 100644 sources/sharepoint/README.md create mode 100644 sources/sharepoint/requirements.txt create mode 100644 tests/sharepoint/__init__.py create mode 100644 tests/sharepoint/test_sharepoint_source.py diff --git a/pyproject.toml b/pyproject.toml index 5be781448..e7025cd35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,10 @@ scrapy = [ "scrapy>=2.11.0,<3", "twisted==22.10.0", ] +sharepoint = [ + "msal>=1.20.0", + "pandas>=2.0.0", +] [tool.uv] default-groups = [ @@ -113,6 +117,7 @@ default-groups = [ "airtable", "filesystem", "scrapy", + "sharepoint", ] # [tool.uv.sources] diff --git a/sources/sharepoint/README.md b/sources/sharepoint/README.md new file mode 100644 index 000000000..dc2e4afb0 --- /dev/null +++ b/sources/sharepoint/README.md @@ -0,0 +1,256 @@ +# SharePoint Source + +This source allows you to extract data from SharePoint lists and files using the Microsoft Graph API. 
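+
+The snippet below is a minimal end-to-end sketch condensed from the usage examples later in this document. The credential values are placeholders from your Azure AD app registration, and the folder, table, and pipeline names are purely illustrative:
+
+```python
+import dlt
+
+from sharepoint import sharepoint_files, SharepointCredentials
+from sharepoint.sharepoint_files_config import SharepointFilesConfig, FileType
+
+# Placeholder credentials -- replace with your Azure AD app registration values
+credentials = SharepointCredentials(
+    client_id="your-client-id",
+    tenant_id="your-tenant-id",
+    site_id="your-site-id",
+    client_secret="your-client-secret",
+)
+
+# Load every CSV file in the folder whose name starts with "report_"
+files_config = SharepointFilesConfig(
+    file_type=FileType.CSV,
+    folder_path="Shared Documents/Reports",
+    table_name="reports",
+    file_name_startswith="report_",
+)
+
+pipeline = dlt.pipeline(
+    pipeline_name="sharepoint_quickstart",
+    destination="duckdb",
+    dataset_name="sharepoint_data",
+)
+load_info = pipeline.run(
+    sharepoint_files(sharepoint_files_config=files_config, credentials=credentials)
+)
+print(load_info)
+```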
+ +## Features + +- Extract data from SharePoint lists +- Download and process files from SharePoint document libraries +- Support for multiple file formats (CSV, Excel, JSON, Parquet, SAS, SPSS) +- Incremental loading support for files based on modification time +- Flexible file filtering with regex patterns + +## Prerequisites + +Before using this source, you need: + +1. **Azure AD Application Registration** with the following: + - Client ID + - Tenant ID + - Client Secret + - Microsoft Graph API permissions: + - `Sites.Read.All` or `Sites.ReadWrite.All` + - `Files.Read.All` (for file operations) + +2. **SharePoint Site ID**: The unique identifier for your SharePoint site + +## Configuration + +### Credentials + +Configure your credentials in `secrets.toml`: + +```toml +[sources.sharepoint] +client_id = "your-client-id" +tenant_id = "your-tenant-id" +site_id = "your-site-id" +client_secret = "your-client-secret" +sub_site_id = "" # Optional: for sub-sites +``` + +### SharePoint List Configuration + +```python +from sharepoint.sharepoint_files_config import SharepointListConfig + +list_config = SharepointListConfig( + table_name="my_list_data", + list_title="My SharePoint List", + select="Title,Description,Status", # Optional: specific fields + is_incremental=False # Incremental not yet implemented +) +``` + +### SharePoint Files Configuration + +```python +from sharepoint.sharepoint_files_config import SharepointFilesConfig, FileType + +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents/Reports", + table_name="reports_data", + file_name_startswith="report_", + pattern=r".*\.csv$", # Optional: regex pattern for filtering + pandas_kwargs={"sep": ","}, # Optional: pandas read options + is_file_incremental=True # Enable incremental loading +) +``` + +## Usage Examples + +### Example 1: Load SharePoint List Data + +```python +import dlt +from sharepoint import sharepoint_list, SharepointCredentials +from sharepoint.sharepoint_files_config import SharepointListConfig + +# Configure credentials +credentials = SharepointCredentials() + +# Configure list extraction +list_config = SharepointListConfig( + table_name="tasks", + list_title="Project Tasks" +) + +# Create and run pipeline +pipeline = dlt.pipeline( + pipeline_name="sharepoint_list", + destination="duckdb", + dataset_name="sharepoint_data" +) + +load_info = pipeline.run( + sharepoint_list( + sharepoint_list_config=list_config, + credentials=credentials + ) +) +print(load_info) +``` + +### Example 2: Load Files from SharePoint + +```python +import dlt +from sharepoint import sharepoint_files, SharepointCredentials +from sharepoint.sharepoint_files_config import SharepointFilesConfig, FileType + +# Configure credentials +credentials = SharepointCredentials() + +# Configure file extraction +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Shared Documents/Reports", + table_name="monthly_reports", + file_name_startswith="report_", + pattern=r"202[4-5].*\.csv$", + is_file_incremental=True, + pandas_kwargs={"sep": ",", "encoding": "utf-8"} +) + +# Create and run pipeline +pipeline = dlt.pipeline( + pipeline_name="sharepoint_files", + destination="duckdb", + dataset_name="sharepoint_data" +) + +load_info = pipeline.run( + sharepoint_files( + sharepoint_files_config=files_config, + credentials=credentials + ) +) +print(load_info) +``` + +### Example 3: Process Excel Files with Chunking + +```python +files_config = SharepointFilesConfig( + file_type=FileType.EXCEL, + 
folder_path="Reports/Annual", + table_name="large_report", + file_name_startswith="annual_", + pandas_kwargs={ + "sheet_name": "Data", + "chunksize": 1000 # Process in chunks of 1000 rows + } +) +``` + +## Supported File Types + +The source supports the following file types via pandas: + +- `FileType.CSV` - CSV files +- `FileType.EXCEL` - Excel files (.xlsx, .xls) +- `FileType.JSON` - JSON files +- `FileType.PARQUET` - Parquet files +- `FileType.SAS` - SAS files +- `FileType.SPSS` - SPSS files + +## Incremental Loading + +### File Incremental Loading + +When `is_file_incremental=True`, the source tracks the `lastModifiedDateTime` of files and only processes files that have been modified since the last run. + +```python +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="data", + file_name_startswith="data_", + is_file_incremental=True # Only process new/modified files +) +``` + +### List Incremental Loading + +Incremental loading for SharePoint lists is not yet implemented. + +## Advanced Configuration + +### Folder Path Validation + +Folder paths are automatically normalized: +- Leading/trailing slashes are removed +- Double slashes are not allowed +- Only alphanumeric characters, dashes, underscores, spaces, and dots are allowed + +### Pattern Matching + +The `pattern` parameter is automatically prefixed with `file_name_startswith`. For example: + +```python +files_config = SharepointFilesConfig( + file_name_startswith="report_", + pattern=r"\d{8}\.csv$" +) +# Effective pattern: ^report_\d{8}\.csv$ +``` + +### Pandas Kwargs + +Any pandas read function parameters can be passed via `pandas_kwargs`: + +```python +files_config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="data", + file_name_startswith="data", + pandas_kwargs={ + "sep": ";", + "encoding": "latin1", + "decimal": ",", + "chunksize": 5000 + } +) +``` + +## Troubleshooting + +### Authentication Issues + +If you encounter authentication errors: +1. Verify your Client ID, Tenant ID, and Client Secret are correct +2. Ensure your Azure AD app has the required permissions +3. Check that admin consent has been granted for the permissions + +### File Not Found + +If files are not being found: +1. Verify the folder path is correct (case-sensitive) +2. Check that the file name pattern matches your files +3. Ensure your app has access to the SharePoint site and folder + +### Permission Errors + +Ensure your Azure AD application has been granted: +- `Sites.Read.All` or `Sites.ReadWrite.All` +- `Files.Read.All` + +And that admin consent has been provided for these permissions. + +## Resources + +- [Microsoft Graph API Documentation](https://learn.microsoft.com/en-us/graph/api/overview) +- [SharePoint REST API](https://learn.microsoft.com/en-us/sharepoint/dev/sp-add-ins/get-to-know-the-sharepoint-rest-service) +- [Azure AD App Registration](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) diff --git a/sources/sharepoint/__init__.py b/sources/sharepoint/__init__.py index bb684add8..b61d84fe2 100644 --- a/sources/sharepoint/__init__.py +++ b/sources/sharepoint/__init__.py @@ -1,3 +1,8 @@ +"""SharePoint data source for dlt. + +Provides sources for extracting data from SharePoint lists and files +using the Microsoft Graph API. 
+""" from typing import Iterator, Dict import re @@ -13,6 +18,15 @@ @configspec class SharepointCredentials(BaseConfiguration): + """Credentials for SharePoint authentication via Azure AD. + + Attributes: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + """ client_id: str = None tenant_id: str = None site_id: str = None @@ -25,6 +39,26 @@ def sharepoint_list( sharepoint_list_config: SharepointListConfig, credentials: SharepointCredentials = dlt.secrets.value, ) -> Iterator[Dict[str, str]]: + """Extract data from a SharePoint list. + + This source connects to SharePoint using Microsoft Graph API and retrieves + items from a specified list. + + Args: + sharepoint_list_config: Configuration for the SharePoint list extraction + credentials: SharePoint authentication credentials + + Yields: + DLT resource containing SharePoint list items + + Example: + >>> config = SharepointListConfig( + ... table_name="tasks", + ... list_title="Project Tasks" + ... ) + >>> source = sharepoint_list(config) + >>> pipeline.run(source) + """ client: SharepointClient = SharepointClient(**credentials) client.connect() logger.info(f"Connected to SharePoint site: {client.site_info}") @@ -49,6 +83,30 @@ def sharepoint_files( sharepoint_files_config: SharepointFilesConfig, credentials: SharepointCredentials = dlt.secrets.value, ): + """Extract and process files from SharePoint document libraries. + + This source downloads files from SharePoint based on the configuration, + processes them using pandas, and yields the data for loading. + + Supports incremental loading based on file modification time. + + Args: + sharepoint_files_config: Configuration for file extraction and processing + credentials: SharePoint authentication credentials + + Yields: + DLT resource containing processed file data + + Example: + >>> config = SharepointFilesConfig( + ... file_type=FileType.CSV, + ... folder_path="Documents/Reports", + ... table_name="reports", + ... file_name_startswith="report_" + ... ) + >>> source = sharepoint_files(config) + >>> pipeline.run(source) + """ client: SharepointClient = SharepointClient(**credentials) client.connect() logger.info(f"Connected to SharePoint site: {client.site_info}") diff --git a/sources/sharepoint/helpers.py b/sources/sharepoint/helpers.py index e634bc374..43aed4ff4 100644 --- a/sources/sharepoint/helpers.py +++ b/sources/sharepoint/helpers.py @@ -1,3 +1,4 @@ +"""Helper module for SharePoint data extraction using Microsoft Graph API.""" from typing import Dict, List from io import BytesIO import re @@ -10,8 +11,21 @@ class SharepointClient: - # * playground: https://developer.microsoft.com/en-us/graph/graph-explorer - # * If the result contains more results, Microsoft Graph returns an @odata.nextLink property + """Client for interacting with SharePoint via Microsoft Graph API. + + This client handles authentication and provides methods to retrieve lists, + list items, and files from SharePoint sites. 
+ + Attributes: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + graph_api_url: Base URL for Microsoft Graph API + graph_site_url: Full URL for the specific SharePoint site + client: REST client instance (set after connect()) + """ def __init__( self, @@ -21,6 +35,18 @@ def __init__( client_secret: str, sub_site_id: str = "", ) -> None: + """Initialize SharePoint client with credentials. + + Args: + client_id: Azure AD application client ID + tenant_id: Azure AD tenant ID + site_id: SharePoint site ID + client_secret: Azure AD application client secret + sub_site_id: Optional sub-site ID for nested sites + + Raises: + ValueError: If any required credentials are missing + """ self.client_id = client_id self.tenant_id = tenant_id self.client_secret = client_secret @@ -37,6 +63,14 @@ def __init__( self.graph_site_url += f"/sites/{self.sub_site_id}" def connect(self) -> None: + """Establish connection to SharePoint using MSAL authentication. + + Acquires an access token using client credentials flow and initializes + the REST client with bearer token authentication. + + Raises: + ConnectionError: If authentication fails or access token cannot be obtained + """ authority = f"https://login.microsoftonline.com/{self.tenant_id}" scope = ["https://graph.microsoft.com/.default"] @@ -62,6 +96,11 @@ def connect(self) -> None: @property def sub_sites(self) -> List: + """Get list of sub-sites within the current SharePoint site. + + Returns: + List of sub-site information dictionaries + """ url = f"{self.graph_site_url}/sites" response = self.client.get(url) site_info = response.json() @@ -72,6 +111,11 @@ def sub_sites(self) -> List: @property def site_info(self) -> Dict: + """Get information about the current SharePoint site. + + Returns: + Dictionary containing site metadata and properties + """ url = f"{self.graph_site_url}" response = self.client.get(url) site_info = response.json() @@ -81,6 +125,14 @@ def site_info(self) -> Dict: logger.warning(f"No site_info found in {url}") def get_all_lists_in_site(self) -> List[Dict]: + """Retrieve all generic lists from the SharePoint site. + + Filters for lists with template type 'genericList' and 'Lists' in their URL, + excluding document libraries and other non-list items. + + Returns: + List of dictionaries containing list metadata + """ url = f"{self.graph_site_url}/lists" res = self.client.get(url) res.raise_for_status() @@ -101,6 +153,20 @@ def get_all_lists_in_site(self) -> List[Dict]: return filtered_lists def get_items_from_list(self, list_title: str, select: str = None) -> List[Dict]: + """Retrieve items from a specific SharePoint list. + + Note: Pagination is not yet implemented; only the first page is returned. + + Args: + list_title: Display name of the SharePoint list + select: Optional comma-separated string of field names to retrieve + + Returns: + List of dictionaries containing list item field values + + Raises: + ValueError: If the specified list is not found in the site + """ # TODO, pagination not yet implemented logger.warning( "Pagination is not implemented for get_items_from_list, " @@ -148,6 +214,16 @@ def get_items_from_list(self, list_title: str, select: str = None) -> List[Dict] def get_files_from_path( self, folder_path: str, file_name_startswith: str, pattern: str = None ) -> Dict: + """Get files from a SharePoint folder matching specified criteria. 
+ + Args: + folder_path: Path to the folder within SharePoint (e.g., 'Documents/Reports') + file_name_startswith: Prefix that file names must start with + pattern: Optional regex pattern for additional filtering + + Returns: + List of file item dictionaries containing metadata and download URLs + """ folder_url = ( f"{self.graph_site_url}/drive/root:/{folder_path}:/children?$filter=startswith(name," f" '{file_name_startswith}')" @@ -164,6 +240,17 @@ def get_files_from_path( return file_items def get_file_bytes_io(self, file_item: Dict): + """Download a SharePoint file to a BytesIO object. + + Args: + file_item: File metadata dictionary containing '@microsoft.graph.downloadUrl' + + Returns: + BytesIO object containing the file contents + + Raises: + FileNotFoundError: If the file cannot be downloaded + """ file_url = file_item["@microsoft.graph.downloadUrl"] response = self.client.get(file_url) if response.status_code == 200: diff --git a/sources/sharepoint/requirements.txt b/sources/sharepoint/requirements.txt new file mode 100644 index 000000000..e7fbee22e --- /dev/null +++ b/sources/sharepoint/requirements.txt @@ -0,0 +1,3 @@ +msal>=1.20.0 +pandas>=2.0.0 +dlt>=0.5.1 diff --git a/sources/sharepoint/sharepoint_files_config.py b/sources/sharepoint/sharepoint_files_config.py index e1bc5a821..45ef8934c 100644 --- a/sources/sharepoint/sharepoint_files_config.py +++ b/sources/sharepoint/sharepoint_files_config.py @@ -1,3 +1,8 @@ +"""Configuration classes for SharePoint data extraction. + +Provides configuration models for SharePoint lists and files, +including file type definitions and validation utilities. +""" from typing import Optional, Dict import re from enum import Enum @@ -7,6 +12,19 @@ class FileType(Enum): + """Supported file types for SharePoint file extraction. + + Each file type maps to a corresponding pandas read function. + + Attributes: + EXCEL: Excel files (.xlsx, .xls) + CSV: Comma-separated values files + JSON: JSON format files + PARQUET: Apache Parquet files + SAS: SAS data files + SPSS: SPSS data files + SAV: SPSS SAV format files + """ EXCEL = "excel" CSV = "csv" JSON = "json" @@ -16,6 +34,11 @@ class FileType(Enum): SAV = "sav" def get_pd_function(self): + """Get the pandas read function for this file type. + + Returns: + Callable pandas read function (e.g., pd.read_csv, pd.read_excel) + """ return { self.EXCEL: pd.read_excel, self.CSV: pd.read_csv, @@ -27,6 +50,17 @@ def get_pd_function(self): class SharepointListConfig(BaseModel): + """Configuration for SharePoint list data extraction. + + Attributes: + table_name: Name of the destination table for the list data + list_title: Display name of the SharePoint list to extract + select: Optional comma-separated field names to retrieve + is_incremental: Enable incremental loading (not yet implemented) + + Raises: + NotImplementedError: If is_incremental is set to True + """ table_name: str list_title: str select: Optional[str] = None @@ -41,6 +75,21 @@ def __init__(self, **data): class SharepointFilesConfig(BaseModel): + """Configuration for SharePoint file extraction and processing. + + Attributes: + file_type: Type of files to process (CSV, Excel, etc.) 
+ folder_path: Path to the SharePoint folder containing files + table_name: Name of the destination table for file data + file_name_startswith: Prefix filter for file names + pattern: Optional regex pattern for additional file filtering + pandas_kwargs: Additional arguments to pass to pandas read function + is_file_incremental: Enable incremental loading based on file modification time + + Note: + The pattern attribute is automatically prefixed with file_name_startswith. + Folder paths are validated and normalized during initialization. + """ file_type: FileType folder_path: str table_name: str @@ -56,6 +105,20 @@ def __init__(self, **data): def validate_folder_path(folder_path: str) -> str: + """Validate and normalize a SharePoint folder path. + + Removes leading/trailing slashes and validates that the path contains + only allowed characters (alphanumeric, dashes, underscores, spaces, dots). + + Args: + folder_path: The folder path to validate + + Returns: + Normalized folder path without leading/trailing slashes + + Raises: + ValueError: If path contains invalid characters or double slashes + """ if folder_path.startswith("/"): folder_path = folder_path[1:] if folder_path.endswith("/"): diff --git a/tests/sharepoint/__init__.py b/tests/sharepoint/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/sharepoint/test_sharepoint_source.py b/tests/sharepoint/test_sharepoint_source.py new file mode 100644 index 000000000..b40782a08 --- /dev/null +++ b/tests/sharepoint/test_sharepoint_source.py @@ -0,0 +1,651 @@ +import pytest +from unittest.mock import Mock, patch, MagicMock +from io import BytesIO +import pandas as pd +from typing import Dict, List + +import dlt +from sources.sharepoint import ( + sharepoint_list, + sharepoint_files, + SharepointCredentials, +) +from sources.sharepoint.sharepoint_files_config import ( + SharepointFilesConfig, + SharepointListConfig, + FileType, + validate_folder_path, +) +from sources.sharepoint.helpers import SharepointClient + +from tests.utils import ( + ALL_DESTINATIONS, + assert_load_info, + load_table_counts, +) + + +# Mock credentials for testing +MOCK_CREDENTIALS = { + "client_id": "test_client_id", + "tenant_id": "test_tenant_id", + "site_id": "test_site_id", + "client_secret": "test_client_secret", + "sub_site_id": "", +} + + +class TestSharepointFilesConfig: + """Test SharepointFilesConfig class""" + + def test_valid_config(self): + """Test creating a valid SharepointFilesConfig""" + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="/Documents/Reports", + table_name="test_table", + file_name_startswith="report", + pattern=r".*\.csv$", + pandas_kwargs={"sep": ","}, + is_file_incremental=True, + ) + assert config.file_type == FileType.CSV + assert config.folder_path == "Documents/Reports" + assert config.table_name == "test_table" + assert config.file_name_startswith == "report" + assert config.pattern == r"^report.*\.csv$" + assert config.pandas_kwargs == {"sep": ","} + assert config.is_file_incremental is True + + def test_folder_path_normalization(self): + """Test that folder paths are normalized correctly""" + config = SharepointFilesConfig( + file_type=FileType.EXCEL, + folder_path="/Documents/", + table_name="test_table", + file_name_startswith="file", + ) + assert config.folder_path == "Documents" + + def test_pattern_prefix(self): + """Test that pattern is prefixed with file_name_startswith""" + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + 
table_name="test_table", + file_name_startswith="report_", + pattern=r"\d{8}\.csv$", + ) + assert config.pattern == r"^report_\d{8}\.csv$" + + def test_get_pd_function(self): + """Test that get_pd_function returns correct pandas functions""" + assert FileType.CSV.get_pd_function() == pd.read_csv + assert FileType.EXCEL.get_pd_function() == pd.read_excel + assert FileType.JSON.get_pd_function() == pd.read_json + assert FileType.PARQUET.get_pd_function() == pd.read_parquet + + +class TestSharepointListConfig: + """Test SharepointListConfig class""" + + def test_valid_config(self): + """Test creating a valid SharepointListConfig""" + config = SharepointListConfig( + table_name="test_table", + list_title="Test List", + select="field1,field2", + is_incremental=False, + ) + assert config.table_name == "test_table" + assert config.list_title == "Test List" + assert config.select == "field1,field2" + assert config.is_incremental is False + + def test_incremental_not_implemented(self): + """Test that incremental loading raises NotImplementedError""" + with pytest.raises(NotImplementedError): + SharepointListConfig( + table_name="test_table", + list_title="Test List", + is_incremental=True, + ) + + +class TestValidateFolderPath: + """Test validate_folder_path function""" + + def test_remove_leading_slash(self): + """Test that leading slashes are removed""" + assert validate_folder_path("/Documents") == "Documents" + + def test_remove_trailing_slash(self): + """Test that trailing slashes are removed""" + assert validate_folder_path("Documents/") == "Documents" + + def test_remove_both_slashes(self): + """Test that both leading and trailing slashes are removed""" + assert validate_folder_path("/Documents/") == "Documents" + + def test_valid_path_with_subdirs(self): + """Test valid path with subdirectories""" + assert validate_folder_path("Documents/Reports/2024") == "Documents/Reports/2024" + + def test_valid_path_with_spaces(self): + """Test valid path with spaces""" + assert validate_folder_path("My Documents/My Reports") == "My Documents/My Reports" + + def test_invalid_characters(self): + """Test that invalid characters raise ValueError""" + with pytest.raises(ValueError, match="Invalid folder path"): + validate_folder_path("Documents/Reports@2024") + + def test_double_slashes(self): + """Test that double slashes raise ValueError""" + with pytest.raises(ValueError, match="Invalid folder path with double slashes"): + validate_folder_path("Documents//Reports") + + +class TestSharepointClient: + """Test SharepointClient class""" + + def test_client_initialization(self): + """Test SharepointClient initialization""" + client = SharepointClient(**MOCK_CREDENTIALS) + assert client.client_id == MOCK_CREDENTIALS["client_id"] + assert client.tenant_id == MOCK_CREDENTIALS["tenant_id"] + assert client.site_id == MOCK_CREDENTIALS["site_id"] + assert client.client_secret == MOCK_CREDENTIALS["client_secret"] + assert client.sub_site_id == "" + assert client.graph_site_url == f"https://graph.microsoft.com/v1.0/sites/{MOCK_CREDENTIALS['site_id']}" + + def test_client_with_subsite(self): + """Test SharepointClient initialization with sub_site_id""" + credentials = MOCK_CREDENTIALS.copy() + credentials["sub_site_id"] = "sub_site_123" + client = SharepointClient(**credentials) + expected_url = f"https://graph.microsoft.com/v1.0/sites/{MOCK_CREDENTIALS['site_id']}/sites/sub_site_123" + assert client.graph_site_url == expected_url + + def test_client_missing_credentials(self): + """Test that missing credentials raise 
ValueError""" + with pytest.raises(ValueError, match="client_id, tenant_id, client_secret and site_id are required"): + SharepointClient( + client_id="", + tenant_id="test", + site_id="test", + client_secret="test", + ) + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_connect_success(self, mock_rest_client, mock_msal_app): + """Test successful connection to SharePoint""" + # Mock MSAL token response + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = { + "access_token": "test_access_token" + } + mock_msal_app.return_value = mock_app_instance + + # Mock REST client + mock_client_instance = Mock() + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + + assert client.client is not None + mock_msal_app.assert_called_once() + mock_rest_client.assert_called_once() + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + def test_connect_failure(self, mock_msal_app): + """Test failed connection to SharePoint""" + # Mock MSAL token response without access_token + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"error": "authentication_failed"} + mock_msal_app.return_value = mock_app_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + with pytest.raises(ConnectionError, match="Connection failed"): + client.connect() + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_all_lists_in_site(self, mock_rest_client, mock_msal_app): + """Test getting all lists from a site""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = Mock() + mock_response = Mock() + mock_response.json.return_value = { + "value": [ + { + "id": "list1", + "displayName": "Test List 1", + "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList1", + "list": {"template": "genericList"}, + }, + { + "id": "list2", + "displayName": "Test List 2", + "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList2", + "list": {"template": "genericList"}, + }, + { + "id": "list3", + "displayName": "Document Library", + "webUrl": "https://test.sharepoint.com/sites/test/Shared Documents", + "list": {"template": "documentLibrary"}, + }, + ] + } + mock_response.raise_for_status = Mock() + mock_client_instance.get.return_value = mock_response + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + lists = client.get_all_lists_in_site() + + assert len(lists) == 2 + assert all(item["list"]["template"] == "genericList" for item in lists) + assert all("Lists" in item["webUrl"] for item in lists) + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_items_from_list(self, mock_rest_client, mock_msal_app): + """Test getting items from a specific list""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = Mock() + + # Mock response for get_all_lists_in_site + mock_lists_response = Mock() + mock_lists_response.json.return_value = { + "value": [ + { 
+ "id": "list1", + "displayName": "Test List", + "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList", + "list": {"template": "genericList"}, + } + ] + } + mock_lists_response.raise_for_status = Mock() + + # Mock response for list items + mock_items_response = Mock() + mock_items_response.json.return_value = { + "value": [ + {"fields": {"Title": "Item 1", "Description": "Test item 1"}}, + {"fields": {"Title": "Item 2", "Description": "Test item 2"}}, + ] + } + mock_items_response.raise_for_status = Mock() + + mock_client_instance.get.side_effect = [mock_lists_response, mock_items_response] + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + items = client.get_items_from_list("Test List") + + assert len(items) == 2 + assert items[0]["Title"] == "Item 1" + assert items[1]["Title"] == "Item 2" + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_items_from_nonexistent_list(self, mock_rest_client, mock_msal_app): + """Test getting items from a list that doesn't exist""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = Mock() + mock_response = Mock() + mock_response.json.return_value = { + "value": [ + { + "id": "list1", + "displayName": "Test List", + "webUrl": "https://test.sharepoint.com/sites/test/Lists/TestList", + "list": {"template": "genericList"}, + } + ] + } + mock_response.raise_for_status = Mock() + mock_client_instance.get.return_value = mock_response + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + + with pytest.raises(ValueError, match="List with title 'Nonexistent List' not found"): + client.get_items_from_list("Nonexistent List") + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_files_from_path(self, mock_rest_client, mock_msal_app): + """Test getting files from a folder path""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = Mock() + mock_response = Mock() + mock_response.json.return_value = { + "value": [ + { + "name": "report_2024.csv", + "file": {}, + "lastModifiedDateTime": "2024-01-01T00:00:00Z", + }, + { + "name": "report_2023.csv", + "file": {}, + "lastModifiedDateTime": "2023-01-01T00:00:00Z", + }, + { + "name": "subfolder", + "folder": {}, + }, + ] + } + mock_client_instance.get.return_value = mock_response + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + files = client.get_files_from_path("Documents", "report", pattern=r".*2024.*") + + assert len(files) == 1 + assert files[0]["name"] == "report_2024.csv" + + @patch("sources.sharepoint.helpers.ConfidentialClientApplication") + @patch("sources.sharepoint.helpers.RESTClient") + def test_get_file_bytes_io(self, mock_rest_client, mock_msal_app): + """Test downloading a file to BytesIO""" + # Setup mocks + mock_app_instance = Mock() + mock_app_instance.acquire_token_for_client.return_value = {"access_token": "test_token"} + mock_msal_app.return_value = mock_app_instance + + mock_client_instance = 
Mock() + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b"test file content" + mock_client_instance.get.return_value = mock_response + mock_rest_client.return_value = mock_client_instance + + client = SharepointClient(**MOCK_CREDENTIALS) + client.connect() + + file_item = { + "name": "test.csv", + "@microsoft.graph.downloadUrl": "https://test.sharepoint.com/download/test.csv", + } + bytes_io = client.get_file_bytes_io(file_item) + + assert isinstance(bytes_io, BytesIO) + assert bytes_io.getvalue() == b"test file content" + + +class TestSharepointListSource: + """Test sharepoint_list source""" + + @patch("sources.sharepoint.SharepointClient") + def test_sharepoint_list_source(self, mock_client_class): + """Test sharepoint_list source yields data correctly""" + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site"} + mock_client_instance.get_items_from_list.return_value = [ + {"Title": "Item 1", "Field1": "Value1"}, + {"Title": "Item 2", "Field1": "Value2"}, + ] + mock_client_class.return_value = mock_client_instance + + # Create config + config = SharepointListConfig( + table_name="test_table", + list_title="Test List", + select="Title,Field1", + ) + + # Create source + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_list(config, credentials=credentials) + + # Extract data + resources = list(source) + assert len(resources) == 1 + + # Get data from resource + resource_data = list(resources[0]) + assert len(resource_data) == 2 + assert resource_data[0]["Title"] == "Item 1" + assert resource_data[1]["Title"] == "Item 2" + + +class TestSharepointFilesSource: + """Test sharepoint_files source""" + + @patch("sources.sharepoint.SharepointClient") + def test_sharepoint_files_source_csv(self, mock_client_class): + """Test sharepoint_files source with CSV files""" + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site"} + mock_client_instance.get_files_from_path.return_value = [ + { + "name": "report.csv", + "lastModifiedDateTime": "2024-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/report.csv", + } + ] + + # Create test CSV data + csv_data = b"col1,col2\nval1,val2\nval3,val4" + mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data) + mock_client_class.return_value = mock_client_instance + + # Create config + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="test_table", + file_name_startswith="report", + ) + + # Create source + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_files(config, credentials=credentials) + + # Extract data + resources = list(source) + assert len(resources) == 1 + + # Get data from resource - this should yield file items first, then dataframes + all_data = list(resources[0]) + + # The transformer should yield dataframes + assert len(all_data) > 0 + + @patch("sources.sharepoint.SharepointClient") + def test_sharepoint_files_source_incremental(self, mock_client_class): + """Test sharepoint_files source with incremental loading""" + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site"} + mock_client_instance.get_files_from_path.return_value = [ + { + "name": "old_file.csv", + "lastModifiedDateTime": "2020-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/old_file.csv", + }, + { + "name": "new_file.csv", + 
"lastModifiedDateTime": "2024-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/new_file.csv", + }, + ] + + csv_data = b"col1,col2\nval1,val2" + mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data) + mock_client_class.return_value = mock_client_instance + + # Create config with incremental loading + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="test_table", + file_name_startswith="file", + is_file_incremental=True, + ) + + # Create source + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_files(config, credentials=credentials) + + # Extract data + resources = list(source) + assert len(resources) == 1 + + @patch("sources.sharepoint.SharepointClient") + def test_sharepoint_files_source_with_chunks(self, mock_client_class): + """Test sharepoint_files source with chunked reading""" + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site"} + mock_client_instance.get_files_from_path.return_value = [ + { + "name": "large_file.csv", + "lastModifiedDateTime": "2024-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/large_file.csv", + } + ] + + # Create larger CSV data + csv_data = b"col1,col2\n" + b"\n".join([b"val1,val2" for _ in range(100)]) + mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data) + mock_client_class.return_value = mock_client_instance + + # Create config with chunksize + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="test_table", + file_name_startswith="large", + pandas_kwargs={"chunksize": 10}, + ) + + # Create source + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_files(config, credentials=credentials) + + # Extract data + resources = list(source) + assert len(resources) == 1 + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +def test_sharepoint_list_pipeline(destination_name: str) -> None: + """Integration test for sharepoint_list pipeline""" + + with patch("sources.sharepoint.SharepointClient") as mock_client_class: + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site", "displayName": "Test Site"} + mock_client_instance.get_items_from_list.return_value = [ + {"Title": "Item 1", "Description": "Description 1", "Status": "Active"}, + {"Title": "Item 2", "Description": "Description 2", "Status": "Completed"}, + {"Title": "Item 3", "Description": "Description 3", "Status": "Active"}, + ] + mock_client_class.return_value = mock_client_instance + + # Create pipeline + pipeline = dlt.pipeline( + pipeline_name="test_sharepoint_list", + destination=destination_name, + dataset_name="sharepoint_list_test", + dev_mode=True, + ) + + # Create config + config = SharepointListConfig( + table_name="test_items", + list_title="Test List", + ) + + # Create source and run pipeline + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_list(config, credentials=credentials) + load_info = pipeline.run(source) + + # Assert load info + assert_load_info(load_info) + + # Check table counts + table_counts = load_table_counts(pipeline, "test_items") + assert table_counts["test_items"] == 3 + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +def test_sharepoint_files_pipeline(destination_name: str) -> None: + """Integration test for sharepoint_files pipeline""" + + with 
patch("sources.sharepoint.SharepointClient") as mock_client_class: + # Setup mock client + mock_client_instance = Mock() + mock_client_instance.site_info = {"id": "test_site", "displayName": "Test Site"} + mock_client_instance.get_files_from_path.return_value = [ + { + "name": "data.csv", + "lastModifiedDateTime": "2024-01-01T00:00:00Z", + "@microsoft.graph.downloadUrl": "https://test.com/data.csv", + } + ] + + # Create test CSV data + csv_data = b"name,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,Chicago" + mock_client_instance.get_file_bytes_io.return_value = BytesIO(csv_data) + mock_client_class.return_value = mock_client_instance + + # Create pipeline + pipeline = dlt.pipeline( + pipeline_name="test_sharepoint_files", + destination=destination_name, + dataset_name="sharepoint_files_test", + dev_mode=True, + ) + + # Create config + config = SharepointFilesConfig( + file_type=FileType.CSV, + folder_path="Documents", + table_name="test_data", + file_name_startswith="data", + ) + + # Create source and run pipeline + credentials = SharepointCredentials(**MOCK_CREDENTIALS) + source = sharepoint_files(config, credentials=credentials) + load_info = pipeline.run(source) + + # Assert load info + assert_load_info(load_info) + + # Check table counts + table_counts = load_table_counts(pipeline, "test_data") + assert table_counts["test_data"] == 3 From 0a76475fe164f0123bb0ba07e76059f6bc60ad97 Mon Sep 17 00:00:00 2001 From: sd41847 Date: Mon, 8 Dec 2025 15:28:30 +0100 Subject: [PATCH 9/9] refactor: Update resource extraction in SharepointListSource and SharepointFilesSource tests for clarity --- tests/sharepoint/test_sharepoint_source.py | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/sharepoint/test_sharepoint_source.py b/tests/sharepoint/test_sharepoint_source.py index b40782a08..492bdd456 100644 --- a/tests/sharepoint/test_sharepoint_source.py +++ b/tests/sharepoint/test_sharepoint_source.py @@ -427,8 +427,8 @@ def test_sharepoint_list_source(self, mock_client_class): credentials = SharepointCredentials(**MOCK_CREDENTIALS) source = sharepoint_list(config, credentials=credentials) - # Extract data - resources = list(source) + # Extract resources from source + resources = list(source.resources.values()) assert len(resources) == 1 # Get data from resource @@ -436,8 +436,6 @@ def test_sharepoint_list_source(self, mock_client_class): assert len(resource_data) == 2 assert resource_data[0]["Title"] == "Item 1" assert resource_data[1]["Title"] == "Item 2" - - class TestSharepointFilesSource: """Test sharepoint_files source""" @@ -472,11 +470,11 @@ def test_sharepoint_files_source_csv(self, mock_client_class): credentials = SharepointCredentials(**MOCK_CREDENTIALS) source = sharepoint_files(config, credentials=credentials) - # Extract data - resources = list(source) + # Extract resources from source + resources = list(source.resources.values()) assert len(resources) == 1 - # Get data from resource - this should yield file items first, then dataframes + # Get data from resource - this should yield dataframes all_data = list(resources[0]) # The transformer should yield dataframes @@ -518,8 +516,8 @@ def test_sharepoint_files_source_incremental(self, mock_client_class): credentials = SharepointCredentials(**MOCK_CREDENTIALS) source = sharepoint_files(config, credentials=credentials) - # Extract data - resources = list(source) + # Extract resources from source + resources = list(source.resources.values()) assert len(resources) == 1 
@patch("sources.sharepoint.SharepointClient") @@ -554,10 +552,15 @@ def test_sharepoint_files_source_with_chunks(self, mock_client_class): credentials = SharepointCredentials(**MOCK_CREDENTIALS) source = sharepoint_files(config, credentials=credentials) - # Extract data - resources = list(source) + # Extract resources from source + resources = list(source.resources.values()) assert len(resources) == 1 + # Get data from resource - with chunksize, this yields multiple dataframes (chunks) + all_chunks = list(resources[0]) + # Should have 10 chunks (100 rows / 10 rows per chunk) + assert len(all_chunks) == 10 + @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_sharepoint_list_pipeline(destination_name: str) -> None: