From 7196a114cc273150ae9c71d93d7470790130619f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ezequiel=20Leonardo=20Casta=C3=B1o?= <14986783+ELC@users.noreply.github.com>
Date: Wed, 18 Jun 2025 22:37:07 -0300
Subject: [PATCH 1/4] Scraped pydata-amsterdam-2023
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #xxx

Event config:

~~~yaml
repo_dir: W:\Repositories\pyvideo-data

# Copy the event template here and adapt to the event parameters
# Only repo_dir: and events: are loaded
# =============================================================================

events:

# - title: PyData Virginia 2025
#   dir: pydata-virginia-2025
#   youtube_list:
#     - https://www.youtube.com/playlist?list=PLGVZCDnMOq0qLS7Mk-jI9jhb4t5UY6yDW
#   related_urls:
#     - label: Conference Website
#       url: https://pydata.org/virginia2025
#   language: eng
#   dates:
#     begin: 2025-04-18
#     end: 2025-04-19
#     default: 2025-04-18
#   minimal_download: false
#   issue: xxx
#   overwrite:
#     # all: true # takes precedence over add_new_files and existing_files_fields
#     add_new_files: true
#     existing_files_fields:
#       - duration
#       - thumbnail_url
#       - videos
#       - description
#       - language
#       - recorded
#       - related_urls
#       - speakers
#       - tags
#       - title
#   tags:

# - title: PyData Global 2024
#   dir: pydata-global-2024
#   youtube_list:
#     - https://www.youtube.com/playlist?list=PLGVZCDnMOq0otKlHvES9iBFtVQ71yZhed
#   related_urls:
#     - label: Conference Website
#       url: https://pydata.org/global2024
#   language: eng
#   dates:
#     begin: 2024-12-03
#     end: 2024-12-05
#     default: 2024-12-03
#   minimal_download: false
#   issue: xxx
#   overwrite:
#     # all: true # takes precedence over add_new_files and existing_files_fields
#     add_new_files: true
#     existing_files_fields:
#       - duration
#       - thumbnail_url
#       - videos
#       - description
#       - language
#       - recorded
#       - related_urls
#       - speakers
#       - tags
#       - title
#   tags:

  - title: PyData New York City 2024
    dir: pydata-new-york-city-2024
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0ohEIZ-_wM2W_xqSVjyA3dC
    related_urls:
      - label: Conference Website
        url: https://pydata.org/nyc2024
    language: eng
    dates:
      begin: 2024-11-06
      end: 2024-11-08
      default: 2024-11-06
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Tel Aviv 2024
    dir: pydata-tel-avid-2024
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0pRsGPxDvLZfuufNgqREc0a
    related_urls:
      - label: Conference Website
        url: https://pydata.org/telaviv2024/
    language: eng
    dates:
      begin: 2024-11-04
      end: 2024-11-04
      default: 2024-11-04
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Paris 2024
    dir: pydata-paris-2024
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0pKya8gksd00ennKuyoH7v7
    related_urls:
      - label: Conference Website
        url: https://pydata.org/paris2024
    language: eng
    dates:
      begin: 2024-09-25
      end: 2024-09-26
      default: 2024-09-25
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Amsterdam 2024
    dir: pydata-amsterdam-2024
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0reU2lzNZCn9obkyRVaSnpF
    related_urls:
      - label: Conference Website
        url: https://web.archive.org/web/20240822042916/https://amsterdam.pydata.org/
    language: eng
    dates:
      begin: 2024-09-18
      end: 2024-09-20
      default: 2024-09-18
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Vermont 2024
    dir: pydata-vermont-2024
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0pME_xSRdmoYFzhlsHJYM8I
    related_urls:
      - label: Conference Website
        url: https://pydata.org/vermont2024/
    language: eng
    dates:
      begin: 2024-07-29
      end: 2024-07-30
      default: 2024-07-29
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Eindhoven 2024
    dir: pydata-eindhoven-2024
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0q7a2aoNP1au_1egfZEjGL6
    related_urls:
      - label: Conference Website
        url: https://pydata.org/eindhoven2024/
    language: eng
    dates:
      begin: 2024-07-11
      end: 2024-07-11
      default: 2024-07-11
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData London 2024
    dir: pydata-london-2024
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0rrhYTNedKKuJ9716fEaAdK
    related_urls:
      - label: Conference Website
        url: https://pydata.org/london2024/
    language: eng
    dates:
      begin: 2024-06-14
      end: 2024-06-16
      default: 2024-06-14
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Berlin 2024
    dir: pydata-berlin-2024
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0r2tGyr-hjbnCrjXRkCMvwB
    related_urls:
      - label: Conference Website
        url: https://2024.pycon.de/
    language: eng
    dates:
      begin: 2024-06-14
      end: 2024-06-16
      default: 2024-06-14
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Global 2023
    dir: pydata-global-2023
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0poULd1C4oUdPbPkTe4poJx
    related_urls:
      - label: Conference Website
        url: https://pydata.org/global2023/
    language: eng
    dates:
      begin: 2023-12-06
      end: 2023-12-08
      default: 2023-12-06
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Eindhoven 2023
    dir: pydata-eindhoven-2023
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0qkbJjIfppGO44yhDV2i4gR
    related_urls:
      - label: Conference Website
        url: https://web.archive.org/web/20240930133013/http://pydata.org/eindhoven2023
    language: eng
    dates:
      begin: 2023-11-30
      end: 2023-11-30
      default: 2023-11-30
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData New York City 2023
    dir: pydata-new-york-city-2023
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0o79mT1hHyqtFDSNzXXSYQM
    related_urls:
      - label: Conference Website
        url: https://pydata.org/nyc2023/
    language: eng
    dates:
      begin: 2023-11-01
      end: 2023-11-03
      default: 2023-11-01
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Amsterdam 2023
    dir: pydata-amsterdam-2023
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0pADyz2VboxPFIdrsozlENg
    related_urls:
      - label: Conference Website
        url: https://amsterdam2023.pydata.org/cfp/schedule/
    language: eng
    dates:
      begin: 2023-09-14
      end: 2023-09-16
      default: 2023-09-14
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Seattle 2023
    dir: pydata-seattle-2023
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0q81_-rt5jzJ--ZEgcNArKb
    related_urls:
      - label: Conference Website
        url: https://pydata.org/seattle2023/
    language: eng
    dates:
      begin: 2023-04-26
      end: 2023-04-28
      default: 2023-04-26
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Berlin 2023
    dir: pydata-berlin-2023
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0peDguAzds7kVmBr8avp46K
    related_urls:
      - label: Conference Website
        url: https://2023.pycon.de/
    language: eng
    dates:
      begin: 2023-04-17
      end: 2023-04-19
      default: 2023-04-17
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Yerevan 2023
    dir: pydata-yerevan-2023
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0pJKftCB2BtalTDE-2xS20g
    language: eng
    dates:
      begin: 2023-10-23
      end: 2024-11-07
      default: 2023-10-23
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Trójmiasto 2023
    dir: pydata-trojmiasto-2023
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0qS0mI7s9tpXnS-XV5l_Ibs
    related_urls:
      - label: Conference Website
        url: https://www.meetup.com/pl-PL/pydata-trojmiasto/
    language: eng
    dates:
      begin: 2023-10-24
      end: 2023-10-24
      default: 2023-10-24
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Tel Aviv 2022
    dir: pydata-tel-avid-2022
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0p6o_fjjdNPqy1rps49z2S0
    related_urls:
      - label: Conference Website
        url: https://pydata.org/telaviv2022/
    language: eng
    dates:
      begin: 2022-12-13
      end: 2022-12-13
      default: 2022-12-13
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Eindhoven 2022
    dir: pydata-eindhoven-2022
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0pI60MsrFpHcII1qWm7drmZ
    related_urls:
      - label: Conference Website
        url: https://pydata.org/eindhoven2022/
    language: eng
    dates:
      begin: 2022-12-02
      end: 2022-12-02
      default: 2022-12-02
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Global 2022
    dir: pydata-global-2022
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0qgYUt0yn7F80wmzCnj2dEq
    related_urls:
      - label: Conference Website
        url: https://pydata.org/global2022/
    language: eng
    dates:
      begin: 2022-12-01
      end: 2022-12-03
      default: 2022-12-01
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData New York City 2022
    dir: pydata-new-york-city-2022
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0opPc5-dp6ZDCFvOqDBlUuv
    related_urls:
      - label: Conference Website
        url: https://pydata.org/nyc2022/
    language: eng
    dates:
      begin: 2022-11-09
      end: 2022-11-11
      default: 2022-11-09
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Yerevan 2022
    dir: pydata-yerevan-2022
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0qWwVVDmdOw6oxAlqqH8Ca-
    related_urls:
      - label: Conference Website
        url: https://pydata.org/yerevan2022/
    language: eng
    dates:
      begin: 2022-08-12
      end: 2022-08-13
      default: 2022-08-12
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData London 2022
    dir: pydata-london-2022
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0qT0MXnci7VBSF-U-0WaQ-w
    related_urls:
      - label: Conference Website
        url: https://pydata.org/london2022/
    language: eng
    dates:
      begin: 2022-06-17
      end: 2022-06-19
      default: 2022-06-17
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Berlin 2022
    dir: pydata-berlin-2022
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0p0Fal8_YKg6fPXnf3iPtwD
    related_urls:
      - label: Conference Website
        url: https://2022.pycon.de/
    language: eng
    dates:
      begin: 2022-04-11
      end: 2022-04-13
      default: 2022-04-11
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Global 2021
    dir: pydata-global-2021
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0rHb3JXG6puQnUAclFFZMlh
    related_urls:
      - label: Conference Website
        url: https://pydata.org/global2021/
    language: eng
    dates:
      begin: 2021-10-28
      end: 2021-10-30
      default: 2021-10-28
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Eindhoven 2021
    dir: pydata-eindhoven-2021
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0rBKcoKoaWJiMrDGdNr2_S0
    related_urls:
      - label: Conference Website
        url: https://pydata.org/eindhoven2021/
    language: eng
    dates:
      begin: 2021-11-12
      end: 2021-11-12
      default: 2021-11-12
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Global 2020
    dir: pydata-global-2020
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0r0eC9BnITmYJ786p9Y1Q8D
    related_urls:
      - label: Conference Website
        url: https://pydataglobal.github.io/
    language: eng
    dates:
      begin: 2020-11-11
      end: 2020-11-15
      default: 2020-11-11
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Eindhoven 2020
    dir: pydata-eindhoven-2020
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0qpKjuGgNOgtOxIuATvnqEr
    related_urls:
      - label: Conference Website
        url: https://pydata.org/eindhoven2020/schedule/
    language: eng
    dates:
      begin: 2020-10-07
      end: 2020-10-09
      default: 2020-10-07
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Amsterdam 2020
    dir: pydata-amsterdam-2020
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0oX4ymLgldSvpfiZj-S8-fH
    related_urls:
      - label: Conference Website
        url: https://datasciencedistrict.nl/pydata-festival-amsterda/
    language: eng
    dates:
      begin: 2020-06-15
      end: 2020-06-20
      default: 2020-06-15
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData South Africa 2018
    dir: pydata-south-africa-2018
    youtube_list:
      - https://www.youtube.com/watch?v=Lvw3Lp3KrTM&list=PLGjWYNrNnSuc78h5x23A5mLAzWlCl9LGf
    related_urls:
      - label: Conference Website
        url: https://2018.za.pycon.org/
    language: eng
    dates:
      begin: 2018-10-11
      end: 2018-10-12
      default: 2018-10-11
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:
  - title: PyData Hamburg 2021
    dir: pydata-hamburg-2021
    youtube_list:
      - https://www.youtube.com/playlist?list=PLGVZCDnMOq0qbRG8gBRkosFfhWrObasQF
    related_urls:
      - label: Conference Website
        url: https://www.meetup.com/pydata-hamburg/
    language: eng
    dates:
      begin: 2020-11-03
      end: 2021-03-03
      default: 2021-03-03
    minimal_download: false
    issue: xxx
    overwrite:
      # all: true # takes precedence over add_new_files and existing_files_fields
      add_new_files: true
      existing_files_fields:
        - duration
        - thumbnail_url
        - videos
        - description
        - language
        - recorded
        - related_urls
        - speakers
        - tags
        - title
    tags:

# ISO_639-3 language codes https://en.wikipedia.org/wiki/ISO_639-3
# languages = {
#     'ita': 'Italian',
#     'zho': 'Chinese',
#     'por': 'Portuguese',
#     'ukr': 'Ukrainian',
#     'deu': 'German',
#     'eng': 'English',
#     'rus': 'Russian',
#     'fra': 'French',
#     'spa': 'Spanish',
#     'eus': 'Basque',
#     'cat': 'Catalan',
#     'glg': 'Galician',
#     'kor': 'Korean',
#     'lit': 'Lithuanian',
#     'jpn': 'Japanese',
#     'ces': 'Czech',
#     'pol': 'Polish',
#     'heb': 'Hebrew',
#     'tha': 'Thai',
#     }
~~~

Scraped with [pyvideo_scrape](https://github.com/pyvideo/pyvideo_scrape)
---
 pydata-amsterdam-2023/category.json | 3 ++
 ...s-to-the-rescue-pydata-amsterdam-2023.json | 44 +++++++++++++++++++
 ...-and-everything-in-between-pdams-2023.json | 28 ++++++++++++
 ...thon-open-source-ecosystem-pdams-2023.json | 28 ++++++++++++
 ...oding-and-when-to-use-what-pdams-2023.json | 28 ++++++++++++
 ...ian-tournaments-pydata-amsterdam-2023.json | 28 ++++++++++++
 ...face-and-skorch-pydata-amsterdam-2023.json | 32 ++++++++++++++
 ...lue-chain-as-a-product-data-scientist.json | 28 ++++++++++++
 ...-with-duckdb-and-arrowflight-pdams-23.json | 28 ++++++++++++
 ...boosting-models-pydata-amsterdam-2023.json | 28 ++++++++++++
 ...sis-a-deep-dive-pydata-amsterdam-2023.json | 28 ++++++++++++
 ...s-of-data-testing-hell-still-relevant.json | 32 ++++++++++++++
 ...m-in-production-pydata-amsterdam-2023.json | 28 ++++++++++++
 ...r-machine-learning-model-optimization.json | 28 ++++++++++++
 ...abyte-data-lake-pydata-amsterdam-2023.json | 28 ++++++++++++
 ...mpaigns-under-uncertainty-with-pystan.json | 40 +++++++++++++++++
 ...players-in-pymc-pydata-amsterdam-2023.json | 28 ++++++++++++
 ...-computer-vision-pipelines-pdams-2023.json | 28 ++++++++++++
 ...transfer-with-compact-neural-networks.json | 24 ++++++++++
 ...s-forecasting-in-the-renewable-energy.json | 28 ++++++++++++
...e-language-models-tools-and-knowledge.json | 28 ++++++++++++ ...e-way-of-eating-pydata-amsterdam-2023.json | 28 ++++++++++++ ...x-max-in-pandas-pydata-amsterdam-2023.json | 28 ++++++++++++ ...ess-to-my-to-do-list-pydata-amsterdam.json | 28 ++++++++++++ ...n-wasn-t-needed-pydata-amsterdam-2023.json | 28 ++++++++++++ ...ithout-dystopia-pydata-amsterdam-2023.json | 28 ++++++++++++ ...i-d-like-them-to-do-pd-amsterdam-2023.json | 28 ++++++++++++ ...rated-learning-with-flower-pdams-2023.json | 28 ++++++++++++ ...mmers-ok-doomer-pydata-amsterdam-2023.json | 28 ++++++++++++ ...ng-with-distance-functions-pdams-2023.json | 28 ++++++++++++ ...man-in-the-loop-pydata-amsterdam-2023.json | 32 ++++++++++++++ ...p-monitor-wildlife-in-parks-in-africa.json | 28 ++++++++++++ ...ation-impact-and-detection-pdams-2023.json | 28 ++++++++++++ ...roach-with-insights-from-the-industry.json | 28 ++++++++++++ ...ent-with-duckdb-pydata-amsterdam-2023.json | 28 ++++++++++++ ...of-news-readers-pydata-amsterdam-2023.json | 28 ++++++++++++ ...al-driven-machine-learning-pdams-2023.json | 28 ++++++++++++ .../pydata-amsterdam-2023-opening-notes.json | 28 ++++++++++++ ...odel-deployment-pydata-amsterdam-2023.json | 28 ++++++++++++ ...ne-with-dagster-pydata-amsterdam-2023.json | 28 ++++++++++++ ...i-level-forecasting-models-pdams-2023.json | 28 ++++++++++++ ...ess-and-pressure-as-a-data-specialist.json | 28 ++++++++++++ ...pression-engine-pydata-amsterdam-2023.json | 28 ++++++++++++ ...ial-intelligence-in-cultural-heritage.json | 28 ++++++++++++ ...-clusters-using-kubernetes-pdams-2023.json | 28 ++++++++++++ ...ghts-from-the-music-industry-pdams-23.json | 28 ++++++++++++ ...n-your-screen-to-a-meal-on-your-plate.json | 28 ++++++++++++ ...tainable-etl-workflows-at-booking-com.json | 28 ++++++++++++ ...ions-in-no-time-with-taipy-pdams-2023.json | 28 ++++++++++++ ...dam-greener-safer-and-more-accessible.json | 28 ++++++++++++ ...-pandas-changes-pydata-amsterdam-2023.json | 32 ++++++++++++++ ...-context-window-pydata-amsterdam-2023.json | 28 ++++++++++++ ...tural-intelligence-is-all-you-need-tm.json | 28 ++++++++++++ ...-llm-created-datasets-to-train-models.json | 28 ++++++++++++ ...fraud-detection-pydata-amsterdam-2023.json | 28 ++++++++++++ 55 files changed, 1555 insertions(+) create mode 100644 pydata-amsterdam-2023/category.json create mode 100644 pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json create mode 100644 pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json create mode 100644 
pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json create mode 100644 pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json create mode 100644 pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json create mode 100644 pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json create mode 100644 pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json create mode 100644 pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json create mode 100644 pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json create mode 100644 pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json create mode 100644 pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json create mode 100644 
pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/pydata-amsterdam-2023-opening-notes.json create mode 100644 pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json create mode 100644 pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json create mode 100644 pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json create mode 100644 pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json create mode 100644 pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json create mode 100644 pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json create mode 100644 pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json create mode 100644 pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json create mode 100644 pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json create mode 100644 pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json create mode 100644 pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json diff --git a/pydata-amsterdam-2023/category.json b/pydata-amsterdam-2023/category.json new file mode 100644 index 000000000..d7291e7e4 --- /dev/null +++ b/pydata-amsterdam-2023/category.json @@ -0,0 +1,3 @@ +{ + "title": "PyData Amsterdam 2023" +} diff --git a/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json new file mode 100644 index 000000000..71987e1dd --- /dev/null +++ b/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json @@ -0,0 +1,44 
@@ +{ + "description": "Pickle files can be evil and simply loading them can run arbitrary code on your system. This talk presents why that is, how it can be exploited, and how skops is tackling the issue for scikit-learn/statistical ML models. We go through some lower level pickle related machinery, and go in detail how the new format works.\n\nThe pickle format has many vulnerabilities and loading them alone can run arbitrary code on the user\u2019s system [1]. In this session we go through the process used by the pickle module to persist python objects, while demonstrating how they can be exploited. We go through how __getstate__ and __setstate__ are used, and how the output of a __reduce__ method is used to reconstruct an object, and how one can have a malicious implementation of these methods to create a malicious pickle file without knowing how to manually create a pickle file by manipulating a file on a lower level. We also briefly touch on other known exploits and issues related to the format [2].\n\nWe also show how one can look inside a pickle file and the operations run by it while loading it, and how one could get an equivalent python script which would result in the output of the pickle file [3]\nThen I present an alternative format from the skops library [4] which can be used to store scikit-learn based models. We talk about what the format is, and how persistence and loading is done, and what we do to prevent loading malicious objects or to avoid running arbitrary code. This format can be used to store almost any scikit-learn estimator, as well as xgboost, lightgbm, and catboost models.\n\n[1] https://peps.python.org/pep-0307/#security-issues\n[2] https://github.com/moreati/pickle-fuzz\n[3] https://github.com/trailofbits/fickling\n[4] https://skops.readthedocs.io/en/stable/persistence.html\n\nBio:\nAdrin\nAdrin works on a few open source projects including skops which tackles some of the MLOps challenges related to scikit-learn models. He has a PhD in Bioinformatics, has worked as a consultant, and in an algorithmic privacy and fairness team. He's also a core developer of scikit-learn and fairlearn.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1339, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + }, + { + "label": "https://github.com/trailofbits/fickling", + "url": "https://github.com/trailofbits/fickling" + }, + { + "label": "https://github.com/moreati/pickle-fuzz", + "url": "https://github.com/moreati/pickle-fuzz" + }, + { + "label": "https://skops.readthedocs.io/en/stable/persistence.html", + "url": "https://skops.readthedocs.io/en/stable/persistence.html" + }, + { + "label": "https://peps.python.org/pep-0307/#security-issues", + "url": "https://peps.python.org/pep-0307/#security-issues" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/9w_H5OSTO9A/maxresdefault.jpg", + "title": "Adrin - Let\u2019s exploit pickle, and `skops` to the rescue! | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=9w_H5OSTO9A" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json b/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json new file mode 100644 index 000000000..671a88fa9 --- /dev/null +++ b/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Power Users, Long Tail Users, and Everything In Between: Choosing Meaningful Metrics and KPIs for Product Strategy\n\nData scientists in industry often have to wear many hats. They must navigate statistical validity, business acumen and strategic thinking, while also representing the end user. In this talk, we will talk about the pillars that make a metric the right one for a job, and how to choose appropriate Key Performance Indicators (KPIs) to drive product success and strategic gains.\n\nOur presentation will traverse the relationship of data science skills in product strategy - embracing the multifaceted role of the data scientist and navigating the journey from user segmentation to making data-driven decisions.\n\nThe Data Scientist's Hat Trick: We initiate by emphasising the assorted roles that a data scientist plays in today's business landscape - from being a statistician ensuring the accuracy and validity of data to a strategist driving business decisions. [5 mins]\n\nChoosing Significant Metrics: Next, we'll delve into the nuances of selecting the right metric for the job. Specifically, we\u2019ll talk about the different pillars of metrics setting, for common data science responsibilities such as randomised controlled trials, offline evaluation, opportunity analysis etc. [7 mins]\n\nSetting The Right KPIs: Once metrics are defined, we'll venture into setting the correct KPIs - the small set of top line numbers that say if our venture is doing the job. [7 mins]\n\nData-Driven Decision Making: Lastly, we'll elucidate how to leverage the data you've gathered to make informed, strategic decisions. This necessitates interpreting your metrics and KPIs, spotting trends, and making necessary adjustments to stay on course. 
[7 mins]\n\nIncorporating real-world case studies, we'll demonstrate how these concepts intertwine to contribute to product success.\n\nLearning Objectives:\n* Appreciate the multifaceted role of a data scientist in driving product strategies.\n* Learn to set realistic and challenging KPIs that align with your company's overarching objectives.\n* Gain insights into leveraging data for informed decision-making and product strategy adjustments.\n\nBio:\nAlon Nir\nData scientist (Data Lead) at Spotify. Dismal scientist by education. Advocating against pie charts since 2015. Self-proclaimed GIF connoisseur.\n\nDror A. Guldin\nData Scientist (Tech Lead) at Meta\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1707, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/Yd35Q2oclY8/maxresdefault.jpg", + "title": "Alon Nir & Dror A. Guldin - Power Users, Long Tail Users, and Everything In Between... | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=Yd35Q2oclY8" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json b/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json new file mode 100644 index 000000000..0b9302106 --- /dev/null +++ b/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "This informative talk aims to close the gap between the theory of data contracts and their real-life implementations. It contains a few Python code snippets and is aimed primarily at data and software engineers. However, it could be food for thought for machine learning engineers, data scientists, and other data consumers.\n\nTopic: There are a lot of ongoing discussions happening about data contracts. I would like to share with you some lessons learned from data contract implementations and show you some Python examples.\n\nAudience: data and software engineers; potentially could be interesting for machine learning engineers, data scientists, and other data consumers. 
Some affinity with Pandas, Great Expectations, and Open Table Formats are desirable.\n\nType: Informative with some hands-on examples\n\nMain takeaways:\n- better understanding of the data contracts concept\n- tips for batch data contracts implementations\n- tips for streaming data contracts implementations\n\nBio: \nAlyona Galyeva\nAlyona Galyeva is an organizer of PyLadies Amsterdam, co-organizer of MLOps and Crafts, Microsoft AI MVP and Principal Engineer at Thoughtworks\nObserve - Optimize - Learn - Repeat\nPassionate about encouraging others to see different perspectives and constructively break the rules.\nI found my joy in building, optimizing, and deploying end-to-end AI and Data Engineering Solutions.\n\n===\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1504, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/YGKqvMhaEVA/maxresdefault.jpg", + "title": "Alyona Galyeva - Data Contracts in action powered by Python open source ecosystem | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=YGKqvMhaEVA" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json b/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json new file mode 100644 index 000000000..d638f6031 --- /dev/null +++ b/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Have you ever struggled with a multitude of columns created by One Hot Encoder? Or decided to look beyond it, but found it hard to decide which feature encoder would be a good replacement?\n\nGood news, there are many encoding techniques that have been developed to address different types of categorical data. 
This talk will provide an overview on various encoding methods available in data science, and a guidance on decision making about which one is appropriate for the data at hand.\n\nJoin this talk if you would like to hear about the importance of feature encoding and why it is important to not default to One Hot Encoding in every scenario. It will start with commonly used approaches and will progress into more advanced and powerful techniques which can help extract meaningful information from the data.\n\nFor each presented encoder, after this talk you will know:\n- When to use it\n- When NOT to use it\n- Important considerations specific to the encoder\n- Python library that offers a built-in method with the encoder, facilitating easy integration into feature engineering pipelines.\n\nI will explore different feature encoding approaches and provide guidance for decision-making. I will cover simpler methods like Label, One Hot, and Frequency encoding, progressing to powerful techniques like Target and Rare Label encoding. Finally, I will explain more complex approaches like Weight of Evidence, Hash and Catboost encoding. I will close the talk with summarizing the key takeaways.\n\nTarget Audience:\nData scientists and anyone interested in feature encoding\n\nPrevious experience with feature encoders can be useful but is not mandatory to follow the talk.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1628, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/4Opsiqj6gcY/maxresdefault.jpg", + "title": "Ana Chaloska - To One-Hot or Not: A guide to feature encoding and when to use what | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=4Opsiqj6gcY" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json new file mode 100644 index 000000000..133be0e05 --- /dev/null +++ b/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Pick your next hot LLM prompt using a Bayesian tournament! Get a quick LLM dopamine hit with a side of decision theory vegetables. It's Bayesian Thunderdome: many prompts enter, one prompt leaves.\n\nHow do you chose the best LLM prompt systematically beyond guessing and vibes? Use the winner of a Bayesian tournament! Get a quick dopamine hit from fun LLM prompt magic with a side of Bayesian decision theory vegetables. If you are doing stuff with LLMs \u2014 you'll get a serious tool to improve your prompting game. If you're not using LLMs \u2014 you'll learn about Bayesian tournaments. They are not well known but have wide applicability: they help you optimally choose a winner using a minimal number of matches.\n\nBio:\nAndy Kitchen\nI've helped found multiple start-ups, including CorticalLabs an AI+Biotech company working on \"Synthetic Biological Intelligence\". I've co-authored several papers and patents in deep learning and neuroscience. I've made a mess in more than a dozen programming languages over my career. My stack is full. I've worked on custom neural interface hardware to web apps and everything in between. I've won a few hack-a-thons. I started the Machine Learning and AI meetup in Melbourne Australia, helped found & organize the Compose :: Melbourne conference. I have two cats, I scoop their poop most days.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1746, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/UY3wxjk2o6o/maxresdefault.jpg", + "title": "Andy Kitchen - Promptly Evaluating Prompts with Bayesian Tournaments | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=UY3wxjk2o6o" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json new file mode 100644 index 000000000..189ae5f3a --- /dev/null +++ b/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json @@ -0,0 +1,32 @@ +{ + "description": "Discover how to bridge the gap between traditional machine learning and the rapidly evolving world of AI with skorch. This package integrates the Hugging Face ecosystem while adhering to the familiar scikit-learn API. We will explore fine-turing of pre-trained models, creating our own tokenizers, accelerating model training, and leveraging Large Language Models.\n\nThe machine learning world is evolving quickly, AI is talked about everywhere, with the Hugging Face ecosystem being in the midst of it. For traditional machine learning users, especially coming from scikit-learn, keeping up can be quite overwhelming. With the help of the skorch package, it is possible to marry the best of both worlds. It allows you to integrate with many of the Hugging Face features while conforming to the sklearn API.\n\nIn this talk, I'll give a brief introduction to skorch. Then we will learn how to use it to tap into the Hugging Face ecosystem, benefiting from: using pre-trained models and fine-tuning them, working with tokenizers as if they were sklearn transformers, accelerating model training, and even using Large Language Models as zero-shot classifiers. I'll discuss some benefits and drawbacks of this approach.\n\nThis talk should be of interest to you if you're coming from the scikit-learn world and are interested in the latest deep learning developments. Familiarity with scikit-learn and a little bit of PyTorch knowledge is recommended.\n\nBio:\nBenjamin Bossan\nI worked as a Data Scientist and Head of Data Science for a couple of ears, now I'm Machine Learning Engineer at Hugging Face. I'm also a maintainer of the skorch package (https://github.com/skorch-dev/skorch).\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. 
\n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1422, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/skorch-dev/skorch", + "url": "https://github.com/skorch-dev/skorch" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/y_n7BjDCS-M/maxresdefault.jpg", + "title": "Bossan - Extend your scikit-learn workflow with Hugging Face and skorch | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=y_n7BjDCS-M" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json b/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json new file mode 100644 index 000000000..1e22e1216 --- /dev/null +++ b/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json @@ -0,0 +1,28 @@ +{ + "description": "Some say machine learning projects fail because they live in notebooks.\n\nBut I would bet that even more of them fail because their projects solve a problem that doesn\u2019t exist. Or uses an interface that\u2019s not feasible. In other words, they fail because they don\u2019t validate their underlying assumptions.\n\nProduct analytics helps build models that solve real problems. In my time at ING, I\u2019ve been dealing with a lot of the latter, and I\u2019ll be sharing my thoughts on how to find problems worth solving with data science.\n\nBio:\nAzamat Omuraliev\nAzamat Omuraliev is a Senior Data Scientist at ING. Cracking the problem of personalization since joining ING in 2020! Decided to stay on this topic because it\u2019s a challenge that requires getting many things right: constructing the right kind of machine learning model, staying in touch with customers and handling millions of interactions daily. Thanks to that, still learning something new on the job every single day.\n\nOriginally from Kyrgyzstan, moved to the Netherlands for studies but stayed for friends and for Amsterdam \u2764\ufe0f\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1529, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/NypFnCRjXJQ/maxresdefault.jpg", + "title": "Building true Machine Learning MVPs: Validating the value chain as a product data scientist", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=NypFnCRjXJQ" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json b/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json new file mode 100644 index 000000000..4d3708833 --- /dev/null +++ b/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json @@ -0,0 +1,28 @@ +{ + "description": "Feature Stores are a vital part of the MLOps stack for managing machine learning features and ensuring data consistency. This talk introduces Feature Stores and the underlying data management architecture. We\u2019ll then discuss the challenges and learnings of integrating DuckDB and Arrow Flight into our Feature Store platform, and share benchmarks showing up to 30x speedups compared to Spark/Hive. Discover how DuckDB and ArrowFlight can also speed up your data management and machine learning pipelines.\n\nIn this talk, we will cover the following topics:\n\n\u2022 Introduction to Machine Learning Feature Stores (5 min): Understanding the role of feature stores in the MLOps stack and their significance in managing machine learning features within organizations.\n\u2022 Data management architecture behind Feature Stores (2-3 min): Exploring the underlying mechanisms and data management components employed in feature stores.\n\u2022 Introduction to DuckDB and Arrow Flight (5 min): Highlighting the integration of DuckDB and Arrow Flight into the PyData ecosystem, leveraging the capabilities of Arrow.\n\u2022 The journey of integrating DuckDB and Arrow Flight into our Feature Store platform (12 min): Sharing our experiences and insights on integrating DuckDB and Arrow Flight into the Hudi-based Lakehouse platform that powers our (offline) feature store, discussing challenges and successes encountered along the way.\n\u2022 Benchmarks (5 min): Presenting a benchmark comparing the performance of DuckDB/Arrow Flight vs Spark/HiveServer2, in particular for small to medium-sized data.\n\nAttendees will gain a deeper understanding of feature stores, insights into the integration of DuckDB and ArrowFlight into the PyData ecosystem, and practical knowledge on enhancing the performance of machine learning pipelines.\n\nBio:\nFabio Buso\nFabio Buso is VP of Engineering at Hopsworks, leading the Feature Store development team. 
Fabio holds a master\u2019s degree in Cloud Computing and Services with a focus on data-intensive applications.\n\nTill D\u00f6hmen\nTill D\u00f6hmen is a Research Engineer at Hopsworks, where he is contributing to the development of Hopsworks' Python-centric Feature Store platform. In addition to his work at Hopsworks, he is a guest researcher at the Intelligent Data Engineering Lab of the University of Amsterdam and engages in research at the intersection of data management and machine learning.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1254, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/w_cAeE5ShnM/maxresdefault.jpg", + "title": "Buso & D\u00f6hmen - MLOps on the fly: Optimizing a feature store with DuckDB and ArrowFlight | PDAMS 23", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=w_cAeE5ShnM" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json new file mode 100644 index 000000000..797b4e4d8 --- /dev/null +++ b/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Did you know that you could do transfer learning on boosted forests too? Even nowadays, we face business cases where the modelling sample is very small. This brings uncertainty to the modelling results and, in some cases, no ability to model at all. To counter it, we investigated the ability to use transfer learning approaches on boosting models. In this talk, we would like to show the methods used and results from a real case example applied to the credit risk domain.\n\nTransfer learning (TL), a form of machine learning, involves leveraging knowledge acquired while addressing one task and applying it to a related task. 
While TL is mainly associated with deep learning tasks, it is also applicable to boosting algorithms which are commonly used in advanced credit risk modelling.\n\nDuring the talk, we present a real use-case involving building a probability of default (PD) model for a customer segment with small data history within the bank. There can be several ways to benefit from data coming from other customer segments with already rich data available within the bank.\n\nSimple approaches would be:\n- Fit a model on only rich data & just apply to the limited data\n- Fit a model on both data sets, but tune it on the limited data\n\nMore complex (TL) approaches:\n- Fit a model on rich data with sample weights derived from a resemblance analysis that calculates the similarity between the two data sources.\n- Use refitting with the limited data on the model trained on rich data\n- Start with an initial pre-trained model while modelling on the limited data\n\nJoin us for an engaging session where we will share the outcomes of our experiments and lessons learned, as we address these approaches that hold relevance beyond the presented use-case, offering practical applicability for similar scenarios in your own domain.\n\nBios:\nBusra Cikla\nBusra is an experienced data scientist with a passion for analytics at ING\u2019s Risk & Pricing Advanced Analytics Team in Amsterdam. She has designed and developed end-to-end advanced analytics solutions to business problems in different domains during the last 5 years at ING. Currently, she is working on real-time credit risk models by using ML. Busra has a background in optimisation and operational research from her B.Sc. studies and holds an M.Sc. degree in Data Science.\n\nPaul Zhutovsky\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1729, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/lmQw_B-JP9o/maxresdefault.jpg", + "title": "Cikla & Zhutovsky - Transfer Learning in Boosting Models | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=lmQw_B-JP9o" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json new file mode 100644 index 000000000..da3e46ea2 --- /dev/null +++ b/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Survival analysis was initially introduced to handle the data analysis required in use cases revolving death and treatment in health care. Due to its merit, this method has spread to many other domains for analyzing and modeling the data where the outcome is the time until an event of interest occurs. Domains such as finance, economy, sociology and engineering.\n\nThis talk aims at unraveling the potential of survival analysis with examples from different domains. A taxonomy of the existing descriptive and predictive analytics algorithms in survival analysis are demonstrated. The concept of some candidate algorithms from each group are explained in detail, along with an example and implementation guideline using the right open source framework.\n\nThis talk aims at introducing the tools and techniques within the survival analysis domain for analyzing the time until an event of interest occurs. Examples of such event are rehospitalization after being discharged from hospital (healthcare), device needing maintenance after (re)commissioning (manufacturing), finding a job after unemployment (economy), an asset being sold after listing for sale (real-estate/finance), getting rearrested after being released from prison (criminology/sociology), and many other examples.\n\nThe potential of survival analysis tools, in both descriptive and predictive analytics, are hidden to the data science community. As a result of this, such problems are often formulated as classification or regression, where this also comes with its own caveats and pitfalls.\n\nThe aim of the talk is to simplify methods and algorithms in survival analysis with some shallow mathematical focus and starts by raising awareness about survival analysis and its potential and applications for the general audience. 
The descriptive and predictive algorithms within survival analysis are addressed to data scientists with a basic statistics and machine learning background, the main audience of the talk.\n\nIntroduction to Survival Analysis\nApplications in different domains\nFormulating Survival Analysis Problem\nTaxonomy of Descriptive & Predictive Methods with Python packages\nOverview of Descriptive Methods\n- Kaplan-Meier [3 slides]\n- Nelson-Aalen & Weibull [half slide]\nOverview of Predictive Methods\n- Cox Proportional Hazard [3 slides]\n- Survival Tree & Forest [1 slide]\n- Deep Survival Analysis [1 slide]\nConclusion\nAt the end of the talk, the audience becomes aware of what survival analysis can do and which algorithms, with their corresponding Python packages, are the low-hanging fruit in a data scientist's toolbox. In addition, the audience will gain a structured overview of the topic so that any further knowledge acquisition can be pursued independently in the future.\n\nBio:\nDanial Senejohnny\nI am a data scientist with a background in applied mathematics (systems & control). In my career as a data scientist, I have worked in different sectors, i.e. manufacturing, cybersecurity, healthcare, and finance. Currently, I am contributing to data-driven solutions that improve our clients\u2019 experience and satisfaction within ABN AMRO.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1386, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/I33h5-GmHSM/maxresdefault.jpg", + "title": "Danial Senejohnny - Survival Analysis: a deep dive | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=I33h5-GmHSM" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json b/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json new file mode 100644 index 000000000..71b0dd642 --- /dev/null +++ b/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json @@ -0,0 +1,32 @@ +{ + "description": "Back in 2018, a blogpost titled \"Data's Inferno: 7 circles of data testing hell with Airflow\" presented a layered approach to data quality checks in data applications and pipelines. Now, 5 years later, this talk looks back at Data's Inferno and surveys what has changed but also what hasn't in the space of ensuring high data quality.\n\n5 years ago a blog post called \"Data's Inferno\" (https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8) was written about how to ensure high data quality with Apache Airflow. It suggested using different types of tests as layers to catch issues lurking within the data. These layers included tests for Airflow DAG integrity, mock data pipelines, production data tests, and more. Combining these layers made for a reliable way to filter out incorrect data. Despite the blogpost's age, the ideas are still relevant today. New tools and applications have been developed to help improve data quality as well as new best practices. In this talk, we'll review the layers of Data's Inferno and how they contributed to improving data quality. We'll also look at how new tools address the same concerns. Finally, we'll discuss how we expect and hope the data quality landscape to evolve in the future.\n\nBio:\nDaniel van der Ende\nDaniel van der Ende is a Data Engineer at Xebia Data. He enjoys working on high performance distributed computation with Spark, empowering data scientists by helping them to run their models on very large datasets with high performance. He is an Apache Spark and Apache Airflow contributor and speaker at conferences and meetups.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1465, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8", + "url": "https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/05py1CEyLxo/maxresdefault.jpg", + "title": "Daniel van der Ende- Return to Data's Inferno: are the 7 layers of data testing hell still relevant?", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=05py1CEyLxo" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json new file mode 100644 index 000000000..a7e26e2dd --- /dev/null +++ b/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "How can you evaluate your production models when the data is not structured and you have no labels? To start, by tracking patterns and changes in the input data and model outputs. In this talk, I will give an overview of the possible approaches to monitor NLP and LLM models: from embedding drift detection to using regular expressions.\n\nOnce LLMs or NLP models are in production, you want to ensure they work as intended. But how can you observe their behavior in the wild and detect when something goes wrong?\n\nFirst, you often lack true labels. To add to this, the data is unstructured - how exactly can you track a pile of texts?\n\nMonitoring the patterns in the input data and model outputs is often the first line of defense. In the talk, I will review possible approaches to monitoring drift and data quality issues in text data and explain their pros and cons.\n\nI will cover:\n- Statistical embedding drift detection\n- Tracking interpretable text descriptors like text length and sentiment\n- Using regular expressions to validate outputs\n- Explaining drift through model-based drift detection\n- Detecting changes in multi-modal data\n\nI will also introduce open-source tools, models, and visualization techniques one can use to monitor LLM and NLP models.\n\nThis talk will benefit data scientists and machine learning engineers who work with NLP and LLM in production.\n\nBio:\nEmeli Dral\nEmeli Dral is a Co-founder and CTO at Evidently AI, a startup developing open-source tools to evaluate, test, and monitor the performance of machine learning models.\n\nEarlier, she co-founded an industrial AI startup and served as the Chief Data Scientist at Yandex Data Factory. She led over 50 applied ML projects for various industries - from banking to manufacturing. 
Emeli is a data science lecturer at GSOM SpBU and Harbour.Space University. She is a co-author of the Machine Learning and Data Analysis curriculum at Coursera with over 100,000 students.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1519, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/aLwDCU8KZB0/maxresdefault.jpg", + "title": "Emeli Dral - Mind the language: how to monitor NLP and LLM in production | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=aLwDCU8KZB0" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json b/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json new file mode 100644 index 000000000..a6026a976 --- /dev/null +++ b/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json @@ -0,0 +1,28 @@ +{ + "description": "Optimizing machine learning models using standard machine learning metrics is a common practice in the industry. However, aligning model optimization with business metrics is closely tied to the objectives of the business and is highly valued by product managers and other stakeholders. This talk delves into the process of training machine learning models based on business metrics in order to enhance economic outcomes. With a primary focus on data scientists and machine learning practitioners, this talk explores techniques, methodologies, and real-world applications that harness the power of business metrics to propel machine learning models and foster business success. We will present a specific case study that demonstrates how we utilized business metrics at Booking.com to significantly improve model performance on business outcomes. 
Specifically, we will discuss our approaches to leveraging business metrics for hyperparameter tuning and reducing model complexity, which instill greater confidence within our team when deploying improved models to production.\n\nDescription\nThis talk aims to equip data scientists and machine learning practitioners with the knowledge and tools to train machine learning models on business metrics effectively. We will delve into the process of hyperparameter tuning, algorithm selection, and model evaluation specifically tailored for optimizing economic outcomes. A real-world use case at Booking.com will demonstrate the transformative power of this approach.\n\nOutline\n- Introduction to training machine learning models on machine learning metrics versus business metrics\n- Overview of the significance of leveraging business metrics to improve machine learning models' performance on business metrics\n- Introduction to machine learning algorithms suitable for modeling business metrics to drive economic optimizations\n- Metrics and evaluation, and training techniques specific to assessing the business impact of machine learning models\n- Showcasing a practical use case at Booking.com where training models on business metrics has led to significant improvements in economic outcomes\n\nCentral Focus\nTraining machine learning models on business metrics presents a powerful methodology for optimizing economic outcomes. By incorporating relevant business data and metrics into the modeling process, data scientists and machine learning practitioners can drive substantial improvements in economic performance. This talk will provide attendees with the necessary insights and techniques to apply this approach successfully.\n\nKey Takeaways\n- Understanding the importance of training machine learning models on business metrics for economic optimizations\n- Familiarity with machine learning algorithms suitable for modeling business metrics and driving economic outcomes\n- Strategies for evaluating and quantifying the economic impact of machine learning models\n- Real-world inspiration and practical insights for applying this approach to boost economic outcomes\n\nWe aim to deliver an informative and practical talk that caters to data scientists and machine learning practitioners. Attendees will gain actionable insights, methodologies, and real-world examples to effectively train machine learning models on business metrics, leading to enhanced economic outcomes.\n\nBio:\nFelipe Moraes\nI am a machine learning scientist at Booking.com working on personalized discounts under budget constraints.\nI have a PhD in Computer Science from the Delft University of Technology. During my PhD, I interned as an applied scientist at Amazon Alexa Shopping, where I worked on finding proxies for what customers find relevant when comparing products during their search shopping journey in order to empower Amazon recommendation systems. Before that, I obtained a BSc and MSc in Computer Science from the Federal University of Minas Gerais, visited research labs at NYU and the University of Quebec, and worked as a software engineer intern in a news recommendation system start-up.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. 
The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1284, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/XUFS-jVpKIc/maxresdefault.jpg", + "title": "Enhancing Economic Outcomes: Leveraging Business Metrics for Machine Learning Model Optimization", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=XUFS-jVpKIc" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json new file mode 100644 index 000000000..bf7689921 --- /dev/null +++ b/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "With Apache Iceberg, you store your big data in the cloud as files (e.g., Parquet), but then query it as if it\u2019s a plain SQL table. You enjoy the endless scalability of the cloud, without having to worry about how to store, partition, or query your data efficiently. PyIceberg is the Python implementation of Apache Iceberg that loads your Iceberg tables into PyArrow (pandas), DuckDB, or any of your preferred engines for doing data science. This means that with PyIceberg, you can tap into big data easily by only using Python. It\u2019s time to say goodbye to the ancient Hadoop-based frameworks of the past! In this talk, you'll learn why you need Iceberg, how to use it, and why it is so fast.\n\nDescription: Working with high volumes of data has always been complex and challenging. Querying data with Spark requires you to know how the data is partitioned; otherwise, your query performance suffers tremendously. The Apache Iceberg open table format addresses this by fixing the underlying storage, instead of by educating the end users. Iceberg originated at Netflix and provides a cloud-native layer on top of your data files. It solves traditional issues regarding correctness by supporting concurrent reading and writing to the table. Iceberg improves performance dramatically by collecting metrics on the data, having the ability to easily repartition your data, and being able to compact the underlying data. Finally, it supports time travel, so the model that you're training doesn't change because new data has been added. 
After this talk, you'll be comfortable using Apache Iceberg.\n\nMinutes 0-5: History and why we need a table format\nMinutes 5-15: Overview of Iceberg, and how it works under the hood\nMinutes 15-30: Introduction to PyIceberg with code and real examples (notebook!!)\n\nBio:\nFokko Driesprong\nOpen Source enthusiast. Committer on Avro, Parquet, Druid, Airflow and Iceberg. Apache Software Foundation member.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1289, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/1A7fFB8QTPY/maxresdefault.jpg", + "title": "Fokko Driesprong - PyIceberg: Tipping your toes into the petabyte data-lake | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=1A7fFB8QTPY" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json b/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json new file mode 100644 index 000000000..49aca70d7 --- /dev/null +++ b/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json @@ -0,0 +1,40 @@ +{ + "description": "In this talk, we discuss how we can use the Python package PySTAN to estimate the Lifetime Value (LTV) of the users that can be acquired from a marketing campaign, and use this estimate to find the optimal bidding strategy when the LTV estimate itself has uncertainty. Throughout the presentation, we highlight the benefits of using Bayesian modeling to estimate LTV, and the potential pitfalls when forecasting LTV. By the end of the presentation, attendees will have a solid understanding of how to use PySTAN to estimate LTV, optimize their marketing campaign bidding strategies, and implement the best Bayesian modelling solution. All of the contents and numbers in this presentation can be found in the shared Git repository\n\nWe describe how to use PySTAN to forecast the LTV of marketing campaigns. 
PySTAN is a Python interface to STAN, which is a package for Bayesian inference capable of high-performance statistical computation. PySTAN\u2019s computation speed is essential in a marketing context, where we need to predict the LTV of multiple marketing campaigns over a long period, while still estimating the LTV distribution. We demonstrate how to implement a PySTAN model to predict a time-series using the Lifetime Value data from Kaggle [2], which contains approximately 200 days, in less than 2 minutes.\nWe then compare how we can achieve the exact same model with PyMC, another well-known probabilistic modelling library, and in which situations and conditions PySTAN outperforms PyMC.\n\nWith the LTV accurately predicted for the Lifetime Value data, we explain the steps to optimize the bid of marketing campaigns under uncertainty about the accuracy of our predictions. We show how different levels of uncertainty of our LTV predictions can change the optimal bidding strategy and answer questions such as \u201cHow much should we underbid when we are unsure of our LTV?\u201d.\nBy the end of the presentation, attendees will be able to implement PySTAN or PyMC to estimate LTV, know which of these two libraries is most appropriate for their needs, and apply this knowledge to find the best bidding strategy for their marketing campaigns.\n\nIn this presentation, we will thus cover the following topics:\n\nIntroduction to digital advertisement\n- Modelling advertisement for digital products\n- How to find the optimal bid for your marketing campaign\n- The role that uncertainty on the estimated LTV plays in your marketing strategy\n\nForecasting LTV with PySTAN\n- What is PySTAN\n- How to use PySTAN to estimate the LTV of your marketing campaigns\n- How to achieve the same model through PyMC\n- Comparison between PySTAN and PyMC\n\nReferences\n- The Duopoly is over because everything is an ad network [https://mobiledevmemo.com/the-duopoly-is-over-because-everything-is-an-ad-network/]\n- Lifetime Value data from Kaggle: https://www.kaggle.com/datasets/baetulo/lifetime-value?select=train.csv\n- Why Uncertainty Matters when forecasting Lifetime Value: https://raphaeltamaki.github.io/raphaeltamaki/posts/Forecasting%20Customer%20Lifetime%20Value%20-%20Why%20Uncertainty%20Matters/\n\nBio:\nRaphael de Brito Tamaki\nData Science Lead in Marketing Science @Meta, where I use causal inference techniques to extract insights to help advertisers increase their marketing performance. Prior to joining Meta, I worked at Wildlife Studios - a mobile game studio with over 2B total downloads - where I was the Tech Lead for the Lifetime Value (LTV) prediction team, and implemented and maintained LTV models in production for over 10 games.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1721, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://www.kaggle.com/datasets/baetulo/lifetime-value?select=train.csv", + "url": "https://www.kaggle.com/datasets/baetulo/lifetime-value?select=train.csv" + }, + { + "label": "https://raphaeltamaki.github.io/raphaeltamaki/posts/Forecasting%20Customer%20Lifetime%20Value%20-%20Why%20Uncertainty%20Matters/", + "url": "https://raphaeltamaki.github.io/raphaeltamaki/posts/Forecasting%20Customer%20Lifetime%20Value%20-%20Why%20Uncertainty%20Matters/" + }, + { + "label": "https://mobiledevmemo.com/the-duopoly-is-over-because-everything-is-an-ad-network/", + "url": "https://mobiledevmemo.com/the-duopoly-is-over-because-everything-is-an-ad-network/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/hcQST0RnN_o/maxresdefault.jpg", + "title": "Forecasting Customer Lifetime Value (CLTV) for Marketing Campaigns under Uncertainty with PySTAN", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=hcQST0RnN_o" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json new file mode 100644 index 000000000..b72d01d5e --- /dev/null +++ b/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "In this talk, we will explore the Bayesian Bradley Terry model implemented in PyMC. We will focus on its application for ranking tennis players, demonstrating how this probabilistic approach can provide accurate and robust rankings, arguably better than the ATP ranking itself and the Elo rating system.\n\nBy leveraging the power of Bayesian statistics, we can incorporate prior knowledge, handle uncertainty, and make better inferences about player abilities. Join us to learn how to implement the Bayesian Bradley Terry model in PyMC and discover its advantages for ranking tennis players.\n\nThe Bradley Terry model is a powerful model to predict the outcome of a paired comparison; as a by-product, we will be able to rank players based on their hidden (latent) ability scores. Traditionally, rankings have been based on simple win-loss records, which may not capture the true abilities of players due to variations in competition quality and sample size. By adopting a Bayesian framework, we can overcome these limitations and obtain more reliable rankings.\n\nIn this talk, we will introduce the Bayesian Bradley Terry model and its underlying principles. We will explore how to encode the model in Python using the PyMC library. 
We will walk through the step-by-step implementation, highlighting key considerations and practical tips.\n\nTo illustrate the model's effectiveness, we will showcase its application to ranking tennis players, and compare it with both the official ATP ranking and the Elo rating system. Tennis provides an ideal domain for this analysis, as it involves head-to-head matches between players, allowing us to directly compare their abilities. By applying the Bayesian Bradley Terry model to historical tennis match data, we can generate rankings that better reflect players' true skills, accounting for factors such as opponent strength and match surface.\n\nThroughout the talk, we will emphasize a hands-on approach, providing code examples and demonstrations. Attendees will gain a solid understanding of the model, learn how to implement it using PyMC, see a practical application and possible extensions, and maybe pick up a few PyMC tricks along the way.\n\nOutline\nWhat's wrong with the current tennis ranking.\nIntroduction to the Bayesian Bradley Terry model.\nImplementation of the model in PyMC.\nApplication to ranking tennis players by latent ability score.\nComparison with the ATP ranking and the Elo rating system.\nPossible extensions and other applications.\n\nBio:\nFrancesco Bruzzesi\nData scientist at HelloFresh with a background in pure mathematics.\nOpen source enthusiast and ML practitioner.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1513, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/z79fClXBOnM/maxresdefault.jpg", + "title": "Francesco Bruzzesi - Bayesian ranking for tennis players in PyMC | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=z79fClXBOnM" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json b/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json new file mode 100644 index 000000000..7398c9c52 --- /dev/null +++ b/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "In the world of computer vision, the focus is often on cutting-edge neural network architectures. However, the true impact usually lies in designing a robust system around the model to solve real-world business challenges. In this talk, we guide you through the process of building practical computer vision pipelines that leverage techniques such as segmentation, classification, and object tracking, demonstrated by our predictive maintenance application at Port of Rotterdam. Whether you're an experienced expert seeking production-worthy pipelines or a novice with a background in data science or engineering eager to dive into image and video processing, we will explore the use of open-source tools to develop and deploy computer vision applications.\n\nThis talk provides a comprehensive demonstration of creating a powerful computer vision pipeline using widely-used libraries such as PyTorch, torchvision, and OpenCV. We break the pipeline down into manageable components, discussing the importance of proper separation of concerns. Onboarding new use cases becomes a breeze when following best practices in the project structure, combined with user-friendly command-line interfaces. Efficient development and validation processes are ensured by designing a sane data model and writing useful tests. Additionally, we explore the critical topic of maintainability, applying MLOps principles for long-term success.\n\nTo bring these concepts to life, we present a real-world application: the Machine Learning Inspector. This predictive maintenance tool, deployed at the Port of Rotterdam, automatically detects and inspects objects in video streams from trucks and ships, delivering actionable insights. We discuss how we work together with asset inspectors to capture their knowledge of the real world in our artificially intelligent computer vision tool.\n\nJoin us in this talk to gain practical knowledge and valuable insights for designing, deploying, and maintaining computer vision pipelines that drive tangible impact. 
We aim to empower the audience to build their own computer vision pipelines; with the right design philosophy, every data professional should be able to build computer vision pipelines that might be complex, but not complicated.\n\nBio:\nWesley Boelrijk\nWesley is the Lead Machine Learning Engineer at Xccelerated (part of Xebia). There, he trains and guides junior-to-medior ML Engineers in Xccelerated's one-year program. Besides that, he works as an MLE on various projects, recently at KLM, ProRail, and Port of Rotterdam. In his free time, he likes to stay up-to-date in the ML ecosystem and play around with computer vision.\n\nJeroen Rombouts\nJeroen is an expert in machine learning and AI, specializing in transforming ideas and proof-of-concepts into value-driven products. Leveraging deep expertise in data science and engineering, he offers practical solutions to enhance machine learning infrastructure and elevate data teams' AI skills.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1537, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/z2DJzByPKJE/maxresdefault.jpg", + "title": "From Vision to Action: Designing and Deploying Effective Computer Vision Pipelines | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=z2DJzByPKJE" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json b/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json new file mode 100644 index 000000000..d71d8ca1d --- /dev/null +++ b/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json @@ -0,0 +1,24 @@ +{ + "description": "This talk explores distillation learning, a powerful technique for compressing and transferring knowledge from larger neural networks to smaller, more efficient ones. It delves into its core components and various applications such as model compression and transfer learning. 
The speaker aims to simplify the topic for all audiences and provides an implementation, demonstrating how to apply distillation learning in real scenarios. Attendees will gain insights into developing efficient neural networks by reviewing various examples of complex models. The material will be accessible online for convenient access and understanding.\n\nAs the field of artificial intelligence continues to advance, the demand for more efficient and compact neural network models has become increasingly vital. The ability to compress and transfer knowledge from larger, complex models to smaller, more efficient models has emerged as a powerful solution. In this talk, we aim to shed light on the significance of distillation learning and its applications across various domains.\n\nIn an era where data sizes and computational requirements are escalating, distillation learning provides a compelling solution to address the challenges posed by these factors. By utilizing a teacher-student framework, this approach facilitates the transfer of knowledge from a larger, well-performing teacher model to a smaller student model. The student model is trained to mimic the behaviour and output of the teacher model, thereby inheriting its expertise. This process enables the creation of compact models that are not only efficient in terms of memory and inference speed but also capable of performing tasks with comparable proficiency. Distillation learning represents a breakthrough in model compression and transfer learning, revolutionizing the field of artificial intelligence and machine learning with deep neural networks.\n\nIn this talk, we will provide a comprehensive overview of distillation learning, covering its core components. We will explore the definition and motivation behind it, highlighting the role of the teacher model in guiding the student model and the objective of the student model to replicate the teacher model's output. Additionally, we will discuss the diverse applications, including model compression, transfer learning, ensemble learning, multi-task learning, and language models. We will also delve into different types of this learning approach, such as model distillation, knowledge distillation, multi-task distillation, and transfer distillation.\n\nThis talk facilitates knowledge exchange and inspires the development of efficient neural networks. The speaker simplifies the topic, making it accessible to all audiences. A simple practical implementation in TensorFlow will be demonstrated, showcasing how attendees can apply this technique in real scenarios. No expertise in complex models is required, and the material will be shared online for convenient access and comprehension.\n\nBio:\nHadi Abdi Khojasteh\nHadi is an R&D senior machine learning engineer at the Deltatre group, where he is an integral member of the innovation lab and a fellow at the Sport Experiences unit, based in Czechia and Italy. With a solid academic background, Hadi is a former lecturer at the Institute for Advanced Studies in Basic Sciences (IASBS) in Iran and a former researcher at the Institute of Formal and Applied Linguistics (\u00daFAL) at Charles University in Prague. Throughout his career, he has actively participated in numerous industrial projects, collaborating closely with renowned experts in the fields of CV/NLP/HLT/CL/ML/DL. 
His research focuses on multimodal learning inspired by neural models that are both linguistically motivated and tailored to language and vision, as well as visual reasoning and deep learning. His main research interests are Machine Learning, Deep Learning, Computer Vision, Multimodal Learning and Visual Reasoning, and he is experienced in a wide variety of international projects on cutting-edge technologies.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.", + "duration": 1314, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/2YmGm0yf6fc/maxresdefault.jpg", + "title": "Hadi Abdi Khojasteh - Distillation Unleashed: Domain Knowledge Transfer with Compact Neural Networks", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=2YmGm0yf6fc" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json b/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json new file mode 100644 index 000000000..c206de22e --- /dev/null +++ b/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json @@ -0,0 +1,28 @@ +{ + "description": "Harnessing uncertainty: the role of probabilistic time series forecasting in the renewable energy transition\n\nHow can probabilistic forecasting accelerate the renewable energy transition? The rapid growth of non-steerable and intermittent wind and solar power requires accurate forecasts and the ability to plan under uncertainty. In this talk, we will make a case for using probabilistic forecasts over deterministic forecasts. We will cover methods for generating and evaluating probabilistic forecasts, and discuss how probabilistic price and wind power forecasts can be combined to derive optimal short-term power trading strategies.\n\nBio:\nAlexander Backus\nAlexander is a Data Science Manager at Dexter Energy, where he is currently leading the development of machine learning-powered short-term power trading optimization products. He brings extensive hands-on machine learning engineering and data science management experience from various industries, including organizations such as KLM Royal Dutch Airlines, ING Bank, Heineken, VodafoneZiggo and IKEA.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. 
PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1575, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/aIZf2cQ0r5U/maxresdefault.jpg", + "title": "Harnessing uncertainty: the role of probabilistic time series forecasting in the renewable energy...", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=aIZf2cQ0r5U" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json b/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json new file mode 100644 index 000000000..d96561318 --- /dev/null +++ b/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json @@ -0,0 +1,28 @@ +{ + "description": "Keynote by Thomas Wolf. He will be accompanied on stage by Alessandro Cappelli, Julien Launay & Guilherme Penedo, all members of the Hugging Face team in Amsterdam working on large model training.\n\nBio:\nThomas Wolf\nThomas Wolf is a co-founder and Chief Science Officer at Hugging Face. He is passionate about creating open-source software that makes complex research accessible, and most proud of creating the Transformers and Datasets libraries as well as the Magic-Sand tool. When he\u2019s not building OSS, he pushes for open science in AI/ML research, trying to narrow the gap between academia and industrial labs. His current research interests are centered around overcoming the current limitations of LLMs with multi-modalities and complementary approaches.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations.
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 2811, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/iQmXF5gxaWY/maxresdefault.jpg", + "title": "Hugging Face \"Processing billions of tokens for training Large Language Models, tools and knowledge\"", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=iQmXF5gxaWY" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json new file mode 100644 index 000000000..7648ca86d --- /dev/null +++ b/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "The proof of the pudding is in the (way of) eating: quasi-experimental methods of causal inference and their practical pitfalls\n\nData scientists and analysts are using quasi-experimental methods to make recommendations based on causality instead of randomized control trials. While these methods are easy to use, their assumptions can be complex to explain. This talk will explain these assumptions for data scientists and analysts without in-depth training in causal inference so they can use and explain these methods more confidently to change people's minds using data.\n\nInstead of relying solely on randomized control trials (also known as A/B tests), which are considered the gold standard for inferring causality, data scientists and analysts are increasingly turning to quasi-experimental methods to make recommendations based on causality. These methods, including open-source libraries such as CausalImpact (originally an R package but with numerous Python ports), are easy to use, but their assumptions can be complex to explain. I will break down these assumptions and explain how they can help practitioners determine when to use these methods (and when not to use them), using examples from the world of digital language learning. The key takeaway is that when it comes to changing people's minds using data, explaining assumptions to decision-makers is just as important as understanding the underlying statistics.\n\nOutline\n- Minute 0-5: Introduction and Motivation\n- Minute 5-10: Difference-in-Difference / Bayesian Structural Time-Series\n- Minute 10-15: Case - Conversion effects of content changes based on language-pair-specific releases at Babbel\n- Minute 15-20: Regression Discontinuity Design\n- Minute 20-25: Case - Estimating motivational effects of language assessment\n- Minute 25-30: Wrap-up / Take-Aways\n\nBio:\nJakob Willisch\nAs Head of Product Data at Babbel, I lead data scientists, analysts and engineers to improve decision-making of people and machines.
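The difference-in-differences method in the outline above is commonly estimated as the interaction coefficient of a simple OLS regression. A minimal sketch on simulated data (statsmodels assumed; this is not the Babbel case study):

~~~python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
n = 2000
treated = rng.integers(0, 2, n)  # e.g. language pairs that got the release
post = rng.integers(0, 2, n)     # observation before/after the release date
# Simulate a true effect of +0.05, present only for treated units post-release.
y = 0.10 + 0.02 * post + 0.01 * treated + 0.05 * treated * post \
    + rng.normal(0, 0.05, n)
df = pd.DataFrame({"y": y, "treated": treated, "post": post})

# The treated:post coefficient is the DiD estimate; it is causal only under
# the parallel-trends assumption whose pitfalls the talk stresses.
fit = smf.ols("y ~ treated * post", data=df).fit()
print(fit.params["treated:post"])
~~~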
Before joining Babbel I did quantitative research in Political Science and Political Economy.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1454, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/bABsVVbMyJc/maxresdefault.jpg", + "title": "Jakob Willisch - The proof of the pudding is in the (way of) eating... | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=bABsVVbMyJc" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json new file mode 100644 index 000000000..5565838f5 --- /dev/null +++ b/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "How do we speed up a critical missing operation in pandas, the cumulative index max, and what does this tell us about the compromises and considerations we must bring to optimizing our code?\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video!
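The description does not reveal which implementation the talk settles on; one plausible vectorised reading of a cumulative index max (the index label of the running maximum at each position) is sketched below, against the slow expanding/apply baseline it would replace:

~~~python
import numpy as np
import pandas as pd

def cumulative_idxmax(s: pd.Series) -> pd.Series:
    """Vectorised alternative to s.expanding().apply(lambda w: w.idxmax()).
    Caveat: on ties with the running max this keeps the most recent
    position, whereas idxmax keeps the first."""
    vals = s.to_numpy()
    running_max = np.maximum.accumulate(vals)
    # Positions that match the running max carry their own position; the rest
    # carry 0 and inherit the last such position via a second accumulate pass.
    pos = np.where(vals == running_max, np.arange(len(vals)), 0)
    pos = np.maximum.accumulate(pos)
    return pd.Series(s.index[pos], index=s.index)

s = pd.Series([3, 1, 4, 1, 5], index=list("abcde"))
print(cumulative_idxmax(s).tolist())  # ['a', 'a', 'c', 'c', 'e']
~~~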
See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1832, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/IyjLM-9Eq3c/maxresdefault.jpg", + "title": "James Powell - Cumulative Index Max in pandas | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=IyjLM-9Eq3c" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json b/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json new file mode 100644 index 000000000..90d450865 --- /dev/null +++ b/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json @@ -0,0 +1,28 @@ +{ + "description": "ChatGPT is a fantastic assistant, but it cannot do everything yet. For example, it cannot automatically manage my calendar, update my to-do list, or do anything that requires it to perform actions. However, what would it take to make this a reality? I decided to put it to the test by allowing ChatGPT to manage my to-do list for me.\n\nDuring this presentation, I will tell you how I gave ChatGPT access to my to-do list. Along the way, I will introduce you to the concepts behind LLM-based agents and how they work. Of course, I will also give a demo of the final result. After this demo, we will dive into clever engineering solutions and tricks I discovered to solve problems such as handling hallucinations, parsing actions, etc.\n\nThis talk is for people who want to learn how to build their first LLM-based agent. Familiarity with Python, Pydantic, and LLMs is nice during this presentation but not essential. As long as you love overengineered solutions to a basic to-do list, you will like this presentation.\n\nDuring the presentation, we will discuss things such as:\n- How to give ChatGPT access to your ToDo(ist) list?\n- What are LLM agents?\n- What is the REACT framework?\n- A demo of the agent I built to manage my to-do list.\n- Implementation tips and tricks to make the agent work better.\n\nThe repo can be found here:\ngithub.com/j0rd1smit/todoist_react_agent\n\nBio:\nJordi Smit\nHi! My name is Jordi Smit. I\u2019m deeply passionate about software engineering, data science, and automation. Nothing makes me happier than creating software that helps humans by automating a tedious and manual-intensive part of their job. Therefore, I love discussing data science since this field has opened the door to many new kinds of automation. However, data science solutions often stay stuck at the proof of concept level. To combat this issue, you also need software engineering knowledge. That is why I love the intersection between software engineering, data science, and automation.\n\nI work as a Machine Learning Engineer at Xebia Data in Amsterdam. Here, I help companies to transform their ML-based models into production-ready applications.
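The linked repo (github.com/j0rd1smit/todoist_react_agent) holds the real implementation; the sketch below only illustrates the REACT-style loop the description mentions, with a scripted stand-in for the LLM and toy tools in place of the Todoist API:

~~~python
import re

SCRIPT = iter([  # canned model replies so the sketch runs without an API key
    "Thought: I should add the task.\nAction: add_task[buy milk]",
    "Final: Added 'buy milk' to your list.",
])

def llm(prompt: str) -> str:
    """Stand-in for a chat-model call; a real agent would send the prompt
    (tool descriptions plus transcript) to the model here."""
    return next(SCRIPT)

TOOLS = {  # toy tools; the talk's agent calls the Todoist API instead
    "add_task": lambda arg: f"added '{arg}'",
    "list_tasks": lambda arg: "1. buy milk",
}

def react_agent(question: str, max_steps: int = 5) -> str:
    transcript = question
    for _ in range(max_steps):
        reply = llm(transcript)
        if reply.startswith("Final:"):
            return reply.removeprefix("Final:").strip()
        match = re.search(r"Action: (\w+)\[(.*)\]", reply)
        if match is None:  # hallucinated/unparseable action: ask for a retry
            transcript += "\nObservation: could not parse action, try again."
            continue
        tool, arg = match.groups()
        result = TOOLS.get(tool, lambda a: f"unknown tool {tool}")(arg)
        transcript += f"\n{reply}\nObservation: {result}"
    return "gave up"

print(react_agent("Put 'buy milk' on my to-do list."))
~~~

Feeding parsing failures back in as observations is one plausible version of the hallucination-handling tricks the description alludes to.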
I love this job because it allows me to explore the intersection between software engineering and data science daily.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1307, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/-rixfb4LWiY/maxresdefault.jpg", + "title": "Jordi Smit - LLM Agents 101: How I Gave ChatGPT Access to My To-Do List | PyData Amsterdam", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=-rixfb4LWiY" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json new file mode 100644 index 000000000..06b14ef50 --- /dev/null +++ b/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "If you are curious about the field of cryptography and what it has to offer data science and machine learning, this talk is for you! We'll dive into the field of encrypted computation, where decryption isn't needed in order to perform calculations, transformations and operations on the data. You'll learn some of the core mathematical theory behind why and how this works, as well as the differences between approaches like homomorphic encryption and secure multi-party computation. At the end, you'll get some pointers and open-source library hints on where to go next and how to start using encrypted computation for problems you are solving the hard way (or not solving at all).\n\nBio:\nKatharine Jarmul\nKatharine Jarmul is a privacy activist and data scientist whose work and research focuses on privacy and security in data science workflows. She recently authored Practical Data Privacy for O'Reilly and works as a Principal Data Scientist at Thoughtworks. 
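Of the two families named in the description above, secure multi-party computation is the easier to show in a few lines: with additive secret sharing, parties can jointly compute a sum while no single party ever sees an input in the clear. A minimal illustration (integers modulo a public prime; not code from the talk):

~~~python
import secrets

P = 2**61 - 1  # public prime modulus; all arithmetic happens mod P

def share(x: int, n: int = 3) -> list[int]:
    """Split x into n additive shares; any n-1 of them reveal nothing about x."""
    parts = [secrets.randbelow(P) for _ in range(n - 1)]
    parts.append((x - sum(parts)) % P)
    return parts

# Party i holds xs[i] and ys[i]; each publishes only its share-wise sum.
xs, ys = share(42), share(100)
pairwise_sums = [(a + b) % P for a, b in zip(xs, ys)]
print(sum(pairwise_sums) % P)  # 142, reconstructed without exposing 42 or 100
~~~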
Katharine has held numerous leadership and independent contributor roles at large companies and startups in the US and Germany -- implementing data processing and machine learning systems with privacy and security built in and developing forward-looking, privacy-first data strategy.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1708, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/bEMK2w6e5xY/maxresdefault.jpg", + "title": "Katharine Jarmul - Encrypted Computation: What if decryption wasn't needed? | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=bEMK2w6e5xY" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json new file mode 100644 index 000000000..832b15259 --- /dev/null +++ b/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Many of us have heard terms like Data for Good, Ethical Machine Learning, Human-Centric Product Design, but those words also bring forward questions -- if we need \"Ethical ML\" what is the rest of machine learning? The current conversation around AI Doom paints a picture where AI goes hand-in-hand with dystopian outcomes. In this keynote, we'll explore what AI could look like if at the core, it was led by these ideals. What if distributed, communal machine learning were a central focus? What if privacy and user choice were a part of our everyday machine learning frameworks? What if aid organizations, governments, coalitions helped shape the problems for AI research? Let's ponder these questions and their outcomes together, imagining AI without the potential for dystopia.\n\nBio:\nKatharine Jarmul\nKatharine Jarmul is a privacy activist and data scientist whose work and research focuses on privacy and security in data science workflows. She recently authored Practical Data Privacy for O'Reilly and works as a Principal Data Scientist at Thoughtworks. 
Katharine has held numerous leadership and independent contributor roles at large companies and startups in the US and Germany -- implementing data processing and machine learning systems with privacy and security built in and developing forward-looking, privacy-first data strategy.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 2149, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/hUawmdYWAO0/maxresdefault.jpg", + "title": "Katharine Jarmul - Keynote \"AI Without Dystopia\" | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=hUawmdYWAO0" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json b/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json new file mode 100644 index 000000000..97e2abe14 --- /dev/null +++ b/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "This talk will explore the Python tooling and ecosystem for estimating conditional average treatment effects (CATEs) in a Causal Inference setting. Using real world-examples, it will compare and contrast the pros and cons of various existing libraries as well as outline desirable functionalities not currently offered by any public library.\n\nConditional average treatment effects (CATEs) are a fundamental concept in Causal Inference, allowing for the estimation of the effect of a particular treatment or intervention. For CATEs, the effect estimation is not only with respect to an entire population, e.g. all experiment participants, but rather with respect to units, e.g. a single experiment participant, with individual characteristics. This can be very important to meaningfully personalize services and products. 
In this talk, we will explore the Python tooling and ecosystem for estimating CATEs, including libraries such as EconML and CausalML.\n\nWe will begin by providing an overview of the theory behind CATE estimation, how it fits into the broader field of causal inference and how Machine Learning has recently broken into CATE estimation. We will then dive into the various libraries available for Python, discussing their strengths and weaknesses and providing real-world examples of their usage.\n\nSpecifically, we will cover:\n- EconML: An open-source library for general Causal Inference purposes, by Microsoft Research\n- CausalML: An open-source library for uplift modeling in particular, by Uber\n\nWe will compare and contrast these libraries with respect to CATE estimation, discussing which methods they use, which assumptions they make, and which types of data they are best suited for. We will also provide code examples to illustrate how to use each library in practice. Moreover, we will discuss what we think is missing from both of them.\n\nBy the end of the talk, attendees will have a solid understanding of the Python tooling and ecosystem for estimating CATEs in a causal inference setting. They will know which libraries to use for different types of data and which methods are most appropriate for different scenarios.\n\nThis talk could be particularly relevant for Data Scientists wishing to analyze experiments, such as A/B tests, or trying to derive causal statements from observational, non-experimental data. Participants are not expected to have Causal Inference expertise. Yet, a fundamental understanding of Machine Learning and Probability Theory will be beneficial.\n\n0-5\u2019: Why Causal Inference and why CATE estimation?\n5-10\u2019: What are some conceptual ways of estimating CATEs?\n10-20\u2019: How can we use EconML and CausalML for CATE estimation on a real dataset?\n20-30\u2019: What are we missing from EconML and CausalML?\n\nBio:\nKevin Klein\nKevin is a Data Scientist at QuantCo, working on fraud detection, risk modelling and experimentation. Prior to joining QuantCo, he focused on Natural Language Processing, discrete optimization and Bayesian optimization during his Computer Science major at ETH, Zurich.\nHe's not very original in that he likes functional programming, running and writing.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
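To give a flavour of the workflow the talk compares, here is a minimal CATE-estimation sketch with EconML's LinearDML on simulated data (the nuisance models and the data-generating process are arbitrary illustrative choices, not the talk's):

~~~python
import numpy as np
from econml.dml import LinearDML
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

rng = np.random.default_rng(0)
n = 2000
X = rng.normal(size=(n, 3))        # unit-level characteristics
T = rng.binomial(1, 0.5, size=n)   # binary treatment assignment
# True CATE grows with the first covariate: effect(x) = x[0].
y = X[:, 0] * T + X.sum(axis=1) + rng.normal(size=n)

est = LinearDML(model_y=RandomForestRegressor(),
                model_t=RandomForestClassifier(),
                discrete_treatment=True)
est.fit(y, T, X=X)
print(est.effect(X[:5]))  # per-unit effect estimates, roughly X[:5, 0]
~~~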
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1340, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/cRS4yZt6OU4/maxresdefault.jpg", + "title": "Kevin Klein - Causal Inference Libraries: What They Do, What I'd Like Them To Do | PD Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=cRS4yZt6OU4" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json b/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json new file mode 100644 index 000000000..06dfccde7 --- /dev/null +++ b/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "With the rise of data privacy concerns around AI in the EU, how can we innovate using AI capabilities despite regulations around consumer data? What tools and features are available to help us build AI in regulated industries? This talk will discuss how we can leverage diverse datasets to build better AI models without ever having to touch the datasets by using a Python library called Flower.\n\nIn this talk, we\u2019ll review the importance of data privacy concerns, particularly in the EU, and address how we can build AI using sensitive data. We'll discuss a few machine learning techniques (classical, distributed and federated learning), and show how federated learning can help us train AI models without ever touching the sensitive data.\n\nThen, we'll evaluate a few of the main open-source Python packages that help engineers get started with federated learning and explain why Flower is a valuable option to consider for your next project. We'll review the core features of Flower; most notably, its ease of use.\n\nAfter that, we\u2019ll jump into a demo and show how, with minimal code, a Python engineer can orchestrate a training job with multiple data sources using federated learning.
We\u2019ll walk through different parameters that give engineers the power to control and fine-tune the server without the hassle of knowing infrastructure or cloud architecture.\n\nBy the end of this talk, you\u2019ll be able to:\n- Understand the role of federated learning in a landscape with increasing regulation around AI, particularly in the EU with the proposed Artificial Intelligence Act\n- Differentiate between federated learning and classical machine learning\n- Design your project so that it is in compliance with current and future legislation passed on how to use personal data\n- Build and fine-tune a server that hosts the model weights for a model trained without seeing personal data\n- Understand options available to increase the privacy around the data that is used to train the model\n\nThere will be a link to a GitHub repo at the end of the talk that contains all the code used in the demo in order to help you get started with your first federated learning project.\n\nBio:\nKrishi Sharma\nKrishi Sharma is a software developer at KUNGFU.AI where she builds software applications that power machine learning models and deliver data for a broad range of services. As a former data scientist and machine learning engineer, she is passionate about building tools that ease the infrastructure dependencies and reduce potential technical debt around handling data. She helped build and maintains an internal Python tool, Potluck, which gives machine learning engineers the ability to bootstrap a containerized, production-ready application with data pipelining templates so that her team can focus on the data and metrics without squandering too much time finagling with deployment and software\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability?
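As a rough sketch of the "minimal code" point above, a Flower client can be about this small (the one-array "model" and the local-training step are placeholders; API names as in flwr 1.x, so treat the exact calls as assumptions):

~~~python
import flwr as fl
import numpy as np

class SiloClient(fl.client.NumPyClient):
    """Runs next to one private dataset; only model weights leave the silo."""
    def __init__(self):
        self.weights = [np.zeros(3)]  # stand-in for real model parameters

    def get_parameters(self, config):
        return self.weights

    def fit(self, parameters, config):
        # Placeholder for local training on the silo's private data.
        self.weights = [w + 0.1 for w in parameters]
        return self.weights, 10, {}   # weights, n local examples, metrics

    def evaluate(self, parameters, config):
        return 0.5, 10, {}            # loss, n local examples, metrics

# On each silo, once a server is running elsewhere via fl.server.start_server:
# fl.client.start_numpy_client(server_address="127.0.0.1:8080", client=SiloClient())
~~~

The server aggregates the returned weights (FedAvg by default) and never receives raw records, which is the compliance point of the talk.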
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1487, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/8njnK_nexEw/maxresdefault.jpg", + "title": "Krishi Sharma - Innovation in the Age of Regulation: Federated Learning with Flower | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=8njnK_nexEw" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json new file mode 100644 index 000000000..a35d8b9e1 --- /dev/null +++ b/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "AI won't end the world, but it can and already does make life miserable for plenty of folks. Instead of engaging with the AI overlords, let's explore a pragmatic set of design choices that all Data Scientists and ML devs can implement right now, to reduce the risks of deploying AI systems in the real world.\n\nLeave the AI boomers to grumble amongst themselves about x-risk and the singularity. Instead let's focus in on how we can alleviate the real-world harms happening right now.\n\nToo often, attempts to identify risks and respond to failure modes of ML and automated systems dive straight into the specifics of model, stack, and implementation. Or worse, add further impenetrable layers of abstraction - the \"more models, more problems\" syndrome. While it's encouraging to see the ecosystem of explainability tools and ML ops surging, as developers and pragmatists we should always prefer the simplest and cheapest tool in our toolkit which is fit for purpose.\n\nThis talk calls attention to a number of existing simple, cheap and effective levers for flagging and reducing risk that are often overlooked.\n\nThese are software design fundamentals like timely and contextual feedback loops, or graceful degradation, that are easily forgotten in the rush to market. These pragmatic tools and product design choices can immediately improve visibility and safety and reduce reputational risk for any team implementing AI.\n\nP.S. Better oversight and tooling for our current tech will, by definition, improve our chances of being alerted if an existentially risky intelligence did happen to emerge from the silicon ether, one day. So it's a win-win, really. \ud83e\udd37\u200d\u2640\ufe0f\n\nBio:\nLaura Summers\nLaura is a Design Engineer and Prodigy Teams Product Lead at Explosion AI.\n\nShe is the founder of Debias AI (debias.ai) and the human behind Sweet Summer Child Score (summerchild.dev), Ethics Litmus Tests (ethical-litmus.site), fairXiv (fairxiv.org) and the Melbourne Fair ML reading group (groups.io/g/fair-ml). Laura is passionate about feminism, digital rights and designing for privacy. She speaks, writes and runs workshops at the intersection of design and technology.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States.
PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1685, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/n88ZmqsTKig/maxresdefault.jpg", + "title": "Laura Summers - Ok, Doomer | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=n88ZmqsTKig" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json b/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json new file mode 100644 index 000000000..ef4ac9a2a --- /dev/null +++ b/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Many algorithms for machine learning from time series are based on measuring the distance or similarity between series. The most popular distance measure is dynamic time warping, which attempts to optimally realign two series to compensate for offset. There are many others though. We present an overview of the most popular time series specific distance functions and describe their speed-optimised implementations in aeon, a scikit-learn compatible time series machine learning toolkit. We demonstrate their application for clustering, classification and regression on a real-world case study and highlight some of the latest distance based time series machine learning tools available in aeon.\n\nThis talk introduces you to popular time series distance functions and demonstrates their usage in exploratory and predictive modelling of time series. Participants will come away with an idea of how to use the very latest research into time series distances for clustering, classification and regression using the aeon toolkit and scikit-learn. The talk will be mostly practical and code based, with some algorithmic and mathematical notation.\n\nDistances are used in all forms of time series machine learning. They can help explore collections of time series through clustering, reduce dimensionality by averaging and be used with instance based or kernel based classifiers and regressors.
They are used in streaming-based anomaly detection and change point detection and have been embedded within tree-based ensembles for classification.\n\nThe basic problem in specifying a distance function is to quantify how dissimilar two series are. Elastic distances attempt to compensate for small mis-alignments caused by offset that would make similar series look very different to measures such as Euclidean distance or correlation. There have been many different algorithms that combine forms of time warping (stretching the indexes to realign series) and editing (removing time points from one of the series to improve alignment). In the first part of the talk we will provide a high level overview and visualisation of the differences between these algorithms before describing the aeon toolkit, which contains the most comprehensive and fastest library of elastic distances that we are aware of. aeon distances can be used directly with sklearn distance based algorithms and with the many time series specific algorithms for classification, clustering and regression available in aeon. In the middle section of the tutorial we will use a real-world industrial dataset to demonstrate use cases in clustering, classification and regression. We will end with some pointers to the very latest research into using distance functions. We will require attendees to have a basic knowledge of scikit-learn and standard machine learning algorithms.\n\nThis should appeal to anyone interested in machine learning from time series. It will focus on practical application and algorithm comprehension rather than maths, and will identify the very latest research into algorithm development to suggest further reading. We will provide easy-to-follow notebooks prior to the talk and all examples will be freely available.\n\nBio:\nTony Bagnall\nTony is a Professor of Computer Science at the University of East Anglia, where he leads the time series machine learning group. His primary research interest is in time series machine learning, with a historic focus on classification, but more recently looking at clustering and regression. He has a side interest in ensemble design.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability?
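A minimal sketch of the elastic-distance idea described above, using aeon's dtw_distance on toy series (assuming aeon's default squared-error cost along the warping path):

~~~python
import numpy as np
from aeon.distances import dtw_distance

# Two series that are near-identical up to a small time shift.
x = np.array([0.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0])
y = np.array([0.0, 0.0, 1.0, 2.0, 3.0, 2.0, 1.0])

print(np.sum((x - y) ** 2))            # lock-step squared error punishes the shift
print(dtw_distance(x, y))              # DTW realigns first, so the cost is small
print(dtw_distance(x, y, window=0.2))  # Sakoe-Chiba band: bounded warping, faster
~~~

Because these distances are plain callables, they can also be plugged into scikit-learn estimators that accept a custom metric.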
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1484, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/O5cnKAUBKkg/maxresdefault.jpg", + "title": "Lets do the time warp again: time series machine learning with distance functions | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=O5cnKAUBKkg" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json new file mode 100644 index 000000000..6a064d8bb --- /dev/null +++ b/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json @@ -0,0 +1,32 @@ +{ + "description": "In the Netherlands a large share of energy is used by industry. By measuring the energy usage of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste. It turns out that in most factories, the biggest source of energy waste comes from idling machines. To be able to give valuable insights and provide relevant alerts to our customers, we set up a machine learning system for standby detection with a \u201chuman in the loop\u201d. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem. No background knowledge is required for this talk.\n\nIn the Netherlands a large share of energy is used by industry (less than 40% compared to only 14% used by households*). Eliminating energy waste in this sector is a big step forward towards a greener future. Therefore, Sensorfact made it its mission to eliminate all industrial energy waste. By measuring the energy usage (electricity or gas) of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste.\n\nIt turns out that in most factories, the biggest source of energy waste comes from forgetting to turn off machines when they are not used. Flagging idling machines based on their electricity usage may seem like a trivial problem at first; however, the large variety in machines and production processes makes this a lot harder than you would expect. To be able to give valuable insights on idling machines and provide relevant alerts to our customers, we set up a machine learning system with a \u201chuman in the loop\u201d.\n\nIn many settings it is perfectly fine to embed a machine learning model in a process without any human interference. However, there are cases where it is better to keep a human in the loop. The most obvious use cases are those where there is simply no room for error, for example in medical applications. However, also in less life-threatening settings it can be beneficial to have a human act as gatekeeper, ensuring high-quality outputs.
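Sensorfact's production system is not public; as a toy illustration of the gatekeeper pattern just described, a first-pass detector can flag low-power runs and leave the final verdict to a reviewer (the threshold and reading frequency are made-up assumptions):

~~~python
import pandas as pd

def flag_standby(power: pd.Series, idle_watts: float = 50.0,
                 min_minutes: int = 30) -> pd.DataFrame:
    """Flag candidate standby periods: power below idle_watts for at least
    min_minutes consecutive 1-minute readings. A human confirms or rejects."""
    idle = power < idle_watts
    run_id = (idle != idle.shift()).cumsum()  # label consecutive runs
    runs = power.groupby(run_id).agg(start=lambda s: s.index[0],
                                     minutes="size", mean_watts="mean")
    runs["is_idle"] = idle.groupby(run_id).first()
    candidates = runs[runs["is_idle"] & (runs["minutes"] >= min_minutes)]
    return candidates.assign(review_status="pending")

idx = pd.date_range("2023-09-14 06:00", periods=120, freq="min")
watts = pd.Series([300.0] * 40 + [20.0] * 50 + [280.0] * 30, index=idx)
print(flag_standby(watts))  # one pending candidate: the 50-minute 20 W run
~~~

Reviewer verdicts can then be fed back as labels, which is what turns the loop into a training signal rather than just a filter.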
In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem, using the case of standby detection. We will share learnings from our own experience and along the way give you an overview of the (open source) tools we chose to use for the different facets of the project.\n\nNo background knowledge is required for this talk. If you are looking for inspiration on how to build a machine learning system with a human in the loop, or if you are curious about sustainability use cases, this talk may be interesting for you.\n\n*https://www.clo.nl/indicatoren/nl0052-energieverbruik-per-sector\n\nBio:\nLieke Kools\nLieke is lead data scientist at Sensorfact, a company aiming to eliminate all industrial energy waste for SMEs. In her role she focusses on the data-fueled products that help their consultants to efficiently and effectively give advice to customers. Before joining Sensorfact she worked as a data science consultant at Vantage AI and completed a PhD in econometrics.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1220, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://www.clo.nl/indicatoren/nl0052-energieverbruik-per-sector", + "url": "https://www.clo.nl/indicatoren/nl0052-energieverbruik-per-sector" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/Hb_yIwkABQU/maxresdefault.jpg", + "title": "Lieke Kools - Standby detection with a human in the loop | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=Hb_yIwkABQU" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json b/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json new file mode 100644 index 000000000..40ba51375 --- /dev/null +++ b/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json @@ -0,0 +1,28 @@ +{ + "description": "Exploration of the intersection between data, AI, and environmental conservation.
In this talk, we will share our experiences and practical insights during our journey trying to develop a system using Python, camera traps and data-driven techniques to help detect poachers in Africa.\n\nIn this storytelling and informative talk, we will delve into our experience using data and AI to monitor wildlife in parks in Africa. Our objective is to provide attendees with a comprehensive understanding of the applications, challenges, and opportunities of leveraging data-driven techniques in environmental conservation.\n\nAudience: individuals interested in leveraging data for positive impact.\n\nThe talk is accessible to a non-technical audience in its story-telling part, but also contains technical parts and details, as well as a live demonstration of the developed and open-sourced solution. Knowledge of Python and cloud infrastructures may be useful.\nTechnologies explored: Python, Node-RED, Streamlit, Google Cloud Platform, Google Vision API, Zamba, Earth Rangers.\n\nBios:\nMa\u00ebl Deschamps\nA Manager Machine Learning Engineer, I lead the MLOps expertise in a team of 20+ Data Engineers & Data Scientists. During my time between Shanghai and Amsterdam I explored 15+ projects for 10+ clients working in various industries.\nI find great joy in making both my teams and clients happy. I believe in management through empathy and transparency and I'm passionate about Data Sustainability and all its related technical challenges.\nFeel free to reach out to discuss any of those topics.\n\nSimone Gayed Said\nHello Hello! \ud83c\udf1f I'm Simone, I work as a Machine Learning engineer, and I'm all about using my skills to make a positive impact on the World! \ud83d\ude80\u2728\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability?
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1147, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/CSsaGm9eJc0/maxresdefault.jpg", + "title": "Ma\u00ebl Deschamps - Our journey using data and AI to help monitor wildlife in parks in Africa", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=CSsaGm9eJc0" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json b/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json new file mode 100644 index 000000000..c920e6af5 --- /dev/null +++ b/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Deepfakes, a form of synthetic media where a person's image or video is seamlessly replaced using Generative AI like GANs, have received significant attention. This talk aims to provide a comprehensive exploration of deepfakes, covering their creation process, positive and negative effects, development pace, and tools for detection. By the end of the presentation, attendees will know how to create and detect deepfakes and will have a deep understanding of the technology and its impact.\n\nTalk Outline:\n\nI. How Deepfakes Work (Approx. 8 minutes)\n- Step-by-step explanation of deepfake creation using an open-source tool\n- Clarifying the technical aspects behind manipulating existing media with AI algorithms\n\nII. Deepfakes with GANs (Approx. 8 minutes)\n- Introduction to Generative Adversarial Networks (GANs) and their role in deepfake generation\n- Different types of GANs and how to craft realistic deepfakes\n\nIII. The Good and the Bad (Approx. 8 minutes)\n- Exploring the positive effects of deepfakes\n- Unveiling the negative implications of deepfakes\n- Real-world examples highlighting the ethical concerns\n- Speculating on the future developments of deepfake technology\n\nIV. How to Recognize Deepfakes (Approx. 6 minutes)\n- Insight into the ongoing efforts to combat the misuse of deepfakes\n- Various approaches and AI-driven tools for detecting deepfake media\n- Understanding the limitations in detecting increasingly sophisticated deepfakes\n\nKey Takeaways:\n- In-depth understanding of deepfake creation and the role of GANs\n- Awareness of the positive and negative impacts of deepfakes in different domains\n- Real-world examples illustrating the ethical concerns surrounding deepfakes\n- Insights into the future trends and advancements in deepfake technology\n- Familiarity with a range of AI-based approaches and tools for detecting deepfakes\n\nBio:\nMaryam Miradi\nMaryam Miradi is the AI and Data Science Lead at Transactie Monitoring Nederland (TMNL). She has a PhD in Artificial Intelligence (Deep Learning) from Delft University of Technology, specialised in NLP and Computer Vision.
Over the last 15 years, she has developed different AI solutions for organisations such as Ahold-Delhaize, Belastingdienst, Alliander, Stedin and ABN AMRO\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1677, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/BlVZddcsyV4/maxresdefault.jpg", + "title": "Maryam Miradi - Deep look into Deepfakes: Mastering Creation, Impact, and Detection | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=BlVZddcsyV4" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json b/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json new file mode 100644 index 000000000..4a6d2b90c --- /dev/null +++ b/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json @@ -0,0 +1,28 @@ +{ + "description": "Recommendation systems shape personalized experiences across various sectors, but evaluating their effectiveness remains a significant challenge. Drawing on experiences from industry leaders such as Booking.com, this talk introduces a robust, practical approach to A/B testing for assessing the quality of recommendation systems. The talk is designed for data scientists, statisticians, and business professionals, offering real-world insights and industry tricks on setting up A/B tests, interpreting results, and circumventing common pitfalls.
While basic familiarity with recommendation systems and A/B testing is beneficial, it's not a prerequisite.\n\nThis talk aims to provide attendees with a practical understanding of A/B testing in the evaluation of recommendation systems, including unique insights from industry practices and specific tricks that enhance effectiveness.\n\nThe talk covers the following steps:\n- Introduction to recommendation systems, their ubiquity, and the imperative for evaluation, including industry examples.\n- An overview of A/B testing and its vital role in assessing recommendation systems, supported by insights from Booking.com and other industry leaders.\n- Techniques for designing effective hypotheses for A/B tests, focusing on recommendation systems.\n- Choosing pertinent metrics for robust evaluation of recommendation systems with industry examples.\n- Conducting A/B tests: industry best practices, common pitfalls, and strategies for mitigation, reinforced by real-world cases.\n- Accurate interpretation of A/B testing results and management of statistical biases, with insights from the field.\nBy the end of the talk, attendees will have a comprehensive understanding of how to apply A/B testing effectively to recommendation systems, select relevant metrics, interpret results accurately, and navigate common challenges, backed by industry best practices and practical examples.\n\nBio:\nIldar Safilo\nMachine Learning Scientist at Booking.com\nExperienced manager in MLE/DS/SE/DA, I possess extensive expertise in machine learning, analytics, and software engineering. I excel at leading teams to create groundbreaking businesses and delivering innovative solutions for real-world business cases across various industries, including IT, banking, telecommunications, marketplaces, game development, shops, Travel-tech and streaming platforms.\nExpert in building recommendation and ranking systems, as well as personalization automation with machine learning, and advanced A/B testing.\nCo-author and lecturer of a popular online course on recommender system development with over 1000 students.\nCo-author of an open-source Python library called RecTools, specifically designed for building recommender systems. The library is hosted on GitHub at RecTools and has received widespread recognition and adoption in the industry.\nHolds a Master\u2019s degree in Mathematics and Computer Science and has over 6 years of experience in data science.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
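As a concrete illustration of the kind of A/B evaluation the abstract describes, here is a minimal sketch of a two-proportion z-test on click-through rates using statsmodels; the counts are invented for illustration and the test says nothing about long-term effects:

~~~python
# Hedged sketch: two-proportion z-test for an A/B test on a recommender,
# e.g. click-through on the control vs. the candidate ranking model.
from statsmodels.stats.proportion import proportions_ztest

clicks = [5_120, 5_410]        # conversions in control / treatment (made up)
sessions = [100_000, 100_000]  # users exposed to each variant (made up)

stat, p_value = proportions_ztest(count=clicks, nobs=sessions)
print(f"z = {stat:.2f}, p = {p_value:.4f}")
# A small p-value suggests the observed CTR lift is unlikely under the
# null hypothesis of no difference between the two variants.
~~~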
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1090, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/cQJfYtTfJQg/maxresdefault.jpg", + "title": "Mastering Recommendation Systems Evaluation: An A/B Testing Approach with Insights from the Industry", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=cQJfYtTfJQg" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json new file mode 100644 index 000000000..42e085c60 --- /dev/null +++ b/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "DuckDB is a novel analytical data management system. DuckDB supports complex queries, has no external dependencies, and is deeply integrated into the Python ecosystem. Because DuckDB runs in the same process, no serialization or socket communication has to occur, making data transfer virtually instantaneous. For example, DuckDB can directly query Pandas data frames faster than Pandas itself. In our talk, we will describe the value DuckDB offers to users, and how it can be used to improve their day-to-day lives through automatic parallelization, efficient operators and out-of-core operations.\n\nData management systems and data analysts have a troubled relationship: Common systems such as Postgres or Spark are unwieldy, hard to set up and maintain, hard to get data in and out of, and hard to integrate into complex end-to-end workflows. As a response, analysts have developed their own ecosystem of data wrangling tools such as Pandas or Polars. These tools are much more natural for analysts to use, but are limited in the amount of data they can process or the amount of automatic optimization that is supported.\n\nDuckDB is a new analytical data management system that is built for an in-process use case. DuckDB speaks SQL, has no external dependencies, and is deeply integrated into the Python ecosystem. DuckDB is Free and Open Source software under the MIT license. DuckDB uses state-of-the-art query processing techniques with vectorized execution, lightweight compression, and morsel-driven automatic parallelism. DuckDB is out-of-core capable, meaning that it can not only read but also process datasets that are bigger than main memory. This allows for analysis of far greater datasets and in many cases removes the need to run separate infrastructure.\n\nThe \u201cduckdb\u201d Python package is not a client to the DuckDB system; it provides the entire database engine. DuckDB runs without any external server directly inside the Python process. Once there, DuckDB can run complex SQL queries on data frames in Pandas, Polars or PyArrow formats out of the box. DuckDB can also directly ingest files in Parquet, CSV or JSON formats. Because DuckDB runs in the same process, data transfers are virtually instantaneous. 
Conversely, DuckDB\u2019s query results can be transferred back into data frames very cheaply, allowing direct integration with complex downstream libraries such as PyTorch or TensorFlow.\n\nDuckDB enjoys fast-growing popularity; the Python package alone is currently downloaded around one million times a month. DuckDB has recently become the default backend of the Ibis project that offers a consistent interface in Python over a variety of data backends.\n\nThis talk is aimed at two main groups: data analysts and data engineers. For the analysts, we will explain the value DuckDB offers to users, and how it can be used to improve their day-to-day lives. For data engineers, we will describe DuckDB\u2019s capabilities to become part of large automated data pipelines. The presenters, Hannes M\u00fchleisen and Mark Raasveldt, are the original creators of DuckDB; they still lead the project and are deeply familiar with its Python integration.\n\nBios:\nHannes M\u00fchleisen\nProf. Dr. Hannes M\u00fchleisen is a creator of the DuckDB database management system and Co-founder and CEO of DuckDB Labs, a consulting company providing services around DuckDB. He is also a senior researcher of the Database Architectures group at the Centrum Wiskunde & Informatica (CWI), the Dutch national research lab for Mathematics and Computer Science in Amsterdam. Hannes is also Professor of Data Engineering at Radboud Universiteit Nijmegen. His main interest is analytical data management systems.\n\nMark Raasveldt\nCTO at DuckDB Labs\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
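Since the abstract's central claim is that the in-process design makes transfers between DuckDB and data frames essentially free, a minimal sketch of that workflow may help; the table contents are illustrative:

~~~python
# Minimal sketch: DuckDB querying a Pandas DataFrame in-process,
# with no server and no copies over a socket.
import duckdb
import pandas as pd

df = pd.DataFrame({"city": ["AMS", "AMS", "RTM"], "amount": [10.0, 5.0, 7.5]})

# DuckDB resolves `df` from the local scope and scans it in place.
result = duckdb.sql("""
    SELECT city, SUM(amount) AS total
    FROM df
    GROUP BY city
    ORDER BY total DESC
""").df()  # transfer the result back into a Pandas DataFrame

print(result)
~~~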
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1392, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/5ddoZR6PYNU/maxresdefault.jpg", + "title": "M\u00fchleisen & Raasveldt - In-Process Analytical Data Management with DuckDB | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=5ddoZR6PYNU" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json new file mode 100644 index 000000000..0211bbe07 --- /dev/null +++ b/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Every news consumer has needs, and in order to build a true bond with your customers it is vital to meet these sometimes diverse needs. To achieve this, first of all, it is important to identify the overarching needs of users; the reason why they read news. The BBC conducted research to determine these needs and identified six distinct categories: Update me, Keep me on trend, Give me perspective, Educate me, Divert me, and Inspire me. Their research showed that an equal distribution of content across these user needs will lead to higher customer engagement and loyalty. To apply this concept within DPG Media, we started building our own user needs model. Through various iterations of text labelling, text preparation, model building, fine-tuning and evaluation, we have arrived at a BERT model that is capable of determining the associated user needs based solely on the article text.\n\nWe would like to take the audience through all the steps that we have taken to get to the point where we are right now. During this process we had to find solutions to many obstacles and we are happy to share these lessons with the audience. Furthermore, we want to discuss all the tools and techniques that we used in order to arrive at the current phase.\n\nThe focus of the talk is on preparing the datasets and building the models, so a background in data science, engineering and/or machine learning is useful.\n\nThe time breakdown will be the following:\nMinutes 0-5: introducing the topic and explaining why it is important\nMinutes 5-10: discussing the tools that we used and prior decisions we made\nMinutes 10-20: going through the labelling process and different models we built\nMinutes 20-25: sharing results and lessons learnt\nMinutes 25-30: giving insights into next steps and future applications\n\nBios:\nJurriaan Nagelkerke\nData Scientist with 15+ years of experience in getting value out of data for various companies in different branches. Love to apply the right ML/AI techniques to answer business questions and actually make a difference. Aside from hands-on consulting, I'm also a trainer in various ML techniques. In the last few years, a strong focus on textual data / NLP and transformer models / LLMs.\n\nVincent Smeets\nHi, my name is Vincent Smeets. 
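As a rough sketch of the inference step for a user-needs classifier like the one described (a fine-tuned BERT model mapping article text to one of the six BBC user needs), assuming a hypothetical fine-tuned checkpoint; the model name and example output below are illustrative, not DPG Media's actual artifacts:

~~~python
# Hedged sketch of user-needs inference with Hugging Face transformers.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="my-org/user-needs-bert",  # hypothetical fine-tuned BERT checkpoint
)

article = "The city council approved a new budget for public transport..."
print(classifier(article))
# e.g. [{'label': 'Update me', 'score': 0.91}]  (illustrative output)
~~~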
I am one of the data scientists within the Data And Customer Analytics department at DPG Media. I am responsible for generating insights from structured and semi-structured data to support decision making within the B2C Marketing organisation. In my free time I love skateboarding, tennis and running.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1335, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/hLbYLP1XCfI/maxresdefault.jpg", + "title": "Nagelkerke & Smeets - Revealing the true motives of news readers | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=hLbYLP1XCfI" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json b/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json new file mode 100644 index 000000000..546c3ba4e --- /dev/null +++ b/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "In this talk, we outline how we introduced causality into our machine learning models within the core checkout and onboarding experiences globally, thereby strongly improving our key business metrics. We discuss case studies, where experimental data were combined with machine learning in order to create value for our users and personalize their experiences, and we share our lessons learned with the goal of inspiring attendees to start incorporating causality into their machine learning solutions. Additionally, we explain how the open source Python package developed at Uber, CausalML, can help others in successfully making the transition from correlation-driven machine learning to causal-driven machine learning.
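The abstract names Uber's open-source CausalML package; as a hedged sketch of its meta-learner API for estimating an average treatment effect, on synthetic data standing in for a real experiment:

~~~python
# Hedged sketch using CausalML's S-learner to estimate an ATE.
import numpy as np
from causalml.inference.meta import LRSRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))          # user features (synthetic)
treatment = rng.binomial(1, 0.5, 1000)  # randomized assignment
y = X[:, 0] + 0.5 * treatment + rng.normal(size=1000)  # outcome, true ATE 0.5

# S-learner with linear regression: one model with treatment as a feature.
learner = LRSRegressor()
ate, lower, upper = learner.estimate_ate(X, treatment, y)
print(f"ATE = {ate[0]:.2f} (95% CI {lower[0]:.2f}..{upper[0]:.2f})")
~~~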
\n\nBio:\nOkke van der Wal\nLeading the Payments Machine Learning team at Uber working on Anomaly Detection, Personalization & Fraud Detection within the Onboarding and Checkout experiences at Uber using Contextual Bandits, Uplift Modelling & Reinforcement Learning.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1316, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/c_dOpCvkNc0/maxresdefault.jpg", + "title": "Okke van der Wal - Personalization at Uber scale via causal-driven machine learning | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=c_dOpCvkNc0" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/pydata-amsterdam-2023-opening-notes.json b/pydata-amsterdam-2023/videos/pydata-amsterdam-2023-opening-notes.json new file mode 100644 index 000000000..78705d58c --- /dev/null +++ b/pydata-amsterdam-2023/videos/pydata-amsterdam-2023-opening-notes.json @@ -0,0 +1,28 @@ +{ + "description": "Opening Notes presented by Leah Silen, Executive Director of NumFOCUS.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. 
\n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1016, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/_nlryVPWTGM/maxresdefault.jpg", + "title": "PyData Amsterdam 2023 - Opening Notes", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=_nlryVPWTGM" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json new file mode 100644 index 000000000..8d27c1d5b --- /dev/null +++ b/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Working on ML serving for a couple of years, we learned a lot. I would like to share a set of best practices and learnings with the community.\n\nAt Adyen we deploy a lot of models for online inference in the payment flow. Working in the MLOps team to streamline this process, I learned a lot about best practices / things to consider before (after) putting a model online. These are small things but they do contribute to a production-ready and reliable setup for online inference. Some examples:\n\nAdding metadata & creating a self-contained archive\nSeparating serving sources from training sources\nChoosing the requirements of the model\nAdding an example input & output request\nAdding schemas for input and output\nCommon issues when putting models online: memory leaks, concurrency\nWhich server is best? Process-based or thread-based?\nHow different Python versions affect inference (execution) time\n\nBio:\nZiad Al Moubayed\nStaff Engineer @ Adyen. I am passionate about high-performance distributed systems. Recently I was working on scaling Adyen's Data & ML infrastructure.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! 
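Two of the practices listed in this abstract, explicit input/output schemas and a bundled example request, can be sketched with FastAPI and pydantic; all names and fields below are illustrative, not Adyen's actual setup:

~~~python
# Hedged sketch: schema-validated online inference endpoint.
from fastapi import FastAPI
from pydantic import BaseModel

class PredictRequest(BaseModel):
    amount: float
    currency: str
    country: str

class PredictResponse(BaseModel):
    score: float
    model_version: str

# Shipping an example request next to the model makes smoke tests trivial.
EXAMPLE_REQUEST = PredictRequest(amount=42.0, currency="EUR", country="NL")

app = FastAPI()

@app.post("/predict", response_model=PredictResponse)
def predict(req: PredictRequest) -> PredictResponse:
    score = 0.1 if req.amount < 100 else 0.9  # stand-in for real inference
    return PredictResponse(score=score, model_version="2023-09-01")
~~~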
See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1436, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/KVEULeK1zk4/maxresdefault.jpg", + "title": "Reliable and Scalable ML Serving: Best Practices for Online Model Deployment | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=KVEULeK1zk4" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json new file mode 100644 index 000000000..41e2034bf --- /dev/null +++ b/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Bored of old pipeline orchestrators? Difficult to understand if data is up-to-date? Trouble with the development workflow of your data pipelines?\nDagster, an open-source tool, offers a unique paradigm that simplifies the orchestration and management of data pipelines.\nBy adopting declarative principles, data engineers and data scientists can build scalable, maintainable, and reliable pipelines effortlessly.\nWe will commence with an introduction to Dagster, covering its fundamental concepts to ensure a comprehensive understanding of the material.\nSubsequently, we will explore practical scenarios and use cases, together with DBT to harness the power of the SQL language.\n\nMinutes 0-5: Explaining the design pattern problems of current data pipeline frameworks.\nMinutes 5-15: Introduction to Dagster and its core concepts.\nMinutes 15-25: Practical examples of building declarative data pipelines with Dagster, including DBT and the power of the gRPC server.\nMinutes 25-30: Q&A and conclusion.\n\nAre you tired of struggling with outdated pipeline orchestrators? Do you find it challenging to ensure your data is always up-to-date? Are you facing difficulties with the development workflow of your data pipeline?\n\nIn this session, we will introduce Dagster, an open-source tool that revolutionizes the orchestration and management of data pipelines. By embracing declarative principles, data engineers and data scientists can effortlessly build scalable, maintainable, and reliable pipelines.\n\nWe will begin by providing an overview of the design pattern problem that many existing data pipeline frameworks face. Understanding the limitations of these frameworks will set the stage for exploring the transformative capabilities of Dagster.\n\nNext, we will delve into the core concepts of Dagster, ensuring a comprehensive understanding of the material. You will learn how Dagster simplifies pipeline development and execution by providing a declarative and intuitive approach. Through practical examples and hands-on demonstrations, we will showcase how you can leverage Dagster to build powerful data pipelines.\n\nBut that's not all! 
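For readers unfamiliar with the declarative, asset-based style this abstract refers to, here is a minimal Dagster sketch; the asset names and data are illustrative:

~~~python
# Minimal Dagster sketch: two software-defined assets with a dependency.
# With the `dagster` package installed, run via `dagster dev -f this_file.py`.
from dagster import Definitions, asset

@asset
def raw_orders():
    # In a real pipeline this would pull from a source system.
    return [{"id": 1, "amount": 10.0}, {"id": 2, "amount": 5.0}]

@asset
def order_totals(raw_orders):
    # Dagster infers the dependency on `raw_orders` from the parameter name.
    return sum(order["amount"] for order in raw_orders)

defs = Definitions(assets=[raw_orders, order_totals])
~~~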
We will also explore the integration of DBT, empowering you to harness the full potential of the SQL language within your data pipelines. You will witness the synergy between Dagster and DBT, unlocking new possibilities for data manipulation and transformation.\n\nBy the end, you'll be equipped with the knowledge and inspiration to elevate your data pipeline workflows to new heights.\n\nOutline:\n\nMinutes 0-5: Understanding the design pattern problem of existing data pipeline frameworks\nMinutes 5-15: Introduction to Dagster and its core concepts\nMinutes 15-25: Practical examples of building declarative data pipelines with Dagster, including the integration with DBT and the power of the gRPC server\nMinutes 25-30: Q&A and conclusion\n\nBio:\nRiccardo Amadio\nSenior Data Engineer at Agile Lab with a background as a Data Scientist and Software Engineer.\nWhen I don't work with data pipelines, I juggle between closing some of my 100+ open tabs on the browser and my true passion: collecting stars on GitHub \ud83d\udd2d\ud83c\udf1f. In this treasure trove of more than 2,000 repositories, I am pretty sure I can find any tool to solve a problem, and I can\u2019t wait to share them with you.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1300, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/VilruuAAwp8/maxresdefault.jpg", + "title": "Riccardo Amadio | Declarative data manipulation pipeline with Dagster | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=VilruuAAwp8" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json b/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json new file mode 100644 index 000000000..56fcf21d1 --- /dev/null +++ b/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Join us as we explore the complexities of balancing the electricity grid amidst the rise of renewable energy sources. 
We\u2019ll discover the challenges in forecasting electricity consumption from diverse industrial resources and the modelling techniques employed by Sympower to achieve accurate forecasts. Gain insights into the trade-offs involved in aggregating data at different hierarchical levels in time series forecasting.\n\nThe shift to renewable energy sources presents a major challenge for the electricity grid: solar and wind facilities are constantly varying in power output, making it harder to keep the supply and demand in balance. This creates a need for demand response: strategic activation or deactivation of large industrial resources to balance the electricity grid. Reliable demand response requires an accurate forecast of industrial electricity consumption, to get a clear understanding of which resources can be controlled at what time.\n\nIn this talk we will discuss the challenges faced when forecasting electricity consumption from industrial resources from different kinds of industries such as furnaces, greenhouses or paper mills. We\u2019ll discuss the different modelling approaches for predicting time series including regression, forecasting and deep learning, and we will discuss the suitability of each in different scenarios. Using the forecasting of electricity consumption of industrial resources as an example, we show how we make our forecasts at Sympower to help balance the electricity grid.\n\nFinally, we will discuss a trade-off in forecasting: Trends and seasonality often only emerge at aggregate levels, making forecasting at the aggregate level easier. On the other hand, business often requires precision-level insights. Aggregate data is inherently less noisy since the errors tend to cancel out, but also might fail to capture lower-level details. We will discuss the considerations to make when forecasting at different aggregated levels in time or across groups, and what you could do to forecast consistently across different aggregate levels.\n\nKEY TAKEAWAYS\n- Gain insights into selecting the most suitable modelling technique for your forecasting needs\n- Understand the challenges posed by the evolving electricity grid and the significance of demand response\n- Explore the trade-offs involved in aggregating data at different hierarchical or temporal levels in time series forecasting\n\nBio:\nRik van der Vlist\nRik is a machine learning engineer with a strong foundation in electrical engineering and a specialization in leveraging electricity data for smart use cases. With previous experience at Eneco, he has focused on delivering automated home energy insights to a large group of customers. Currently, Rik is dedicated to constructing a scalable forecasting model for a sustainable electricity grid, combining his passion for data science and sustainable solutions. He thrives on creating value and generating insights from raw data, demonstrating his proficiency in building robust and scalable data pipelines using Spark and Python.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. 
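The aggregation trade-off described above, that errors on individual series partly cancel out at the aggregate level, can be demonstrated in a few lines of NumPy on synthetic data; the numbers are invented for illustration:

~~~python
# Hedged sketch: aggregate series are less noisy than their components.
import numpy as np

rng = np.random.default_rng(42)
n_resources, n_hours = 50, 24 * 7

# Each resource: a shared daily pattern plus heavy idiosyncratic noise.
pattern = 100 + 20 * np.sin(2 * np.pi * np.arange(n_hours) / 24)
series = pattern + rng.normal(0, 30, size=(n_resources, n_hours))

cv_individual = (series.std(axis=1) / series.mean(axis=1)).mean()
aggregate = series.sum(axis=0)
cv_aggregate = aggregate.std() / aggregate.mean()

print(f"mean coefficient of variation per resource: {cv_individual:.3f}")
print(f"coefficient of variation of the aggregate:  {cv_aggregate:.3f}")
# The aggregate CV is noticeably smaller: independent noise averages out,
# while the lower-level detail per resource is lost.
~~~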
\n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1472, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/30x-TYxZ6QY/maxresdefault.jpg", + "title": "Rik van der Vlist - Balancing the electricity grid with multi-level forecasting models | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=30x-TYxZ6QY" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json b/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json new file mode 100644 index 000000000..85cbf5880 --- /dev/null +++ b/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json @@ -0,0 +1,28 @@ +{ + "description": "Data science, IT and software development are becoming more and more complex and are subject to increasing requirements and fast-paced business demand. Higher complexity, higher pace and higher quality requirements result in more pressure on our fellow data engineers and data scientists.\n\nMore pressure, but are we resilient enough to withstand that increasing pressure? You have probably already seen its outcome. Unhappiness, stress or even burn-outs of co-workers, instead of creating cool code, great solutions and building a better world using your skills.\n\nHow to change the pressure and stress you perceive as a data scientist, data engineer or ML-engineer? How to ensure that your brain\u2019s frontal lobe returns to a problem-solving and decision-making state?\n\nTarget audience\nData engineers, data scientists and analysts of all experience levels. For those who start hitting do\u2019s, don\u2019ts and other hard walls in real-life companies and projects. Especially if you experience a drain of energy and focus from this pressure and these constraints. Senior or junior, there is much to learn and experience.\n\nTakeaway\nLearn and experience 3 great tools to change your resilience instantly and consistently towards pressure and stress. Not just for yourself, but also be able to see and assist co-workers, family, or other loved ones if they experience stress.\n\nBackground knowledge needed\nNone. Just be sure to bring both your head and body to this workshop to experience how quickly these tools work for you.\n\nTime\n\u2022 0 \u2013 5 Intro and experience tool #1\n\u2022 5 \u2013 15 Control your nervous system and work-related stress\n\u2022 15 \u2013 20 experience tool #2\n\u2022 20 \u2013 25 Your stress and social states (based on polyvagal theory)\n\u2022 25 \u2013 30 experience tool #3\n\nBio:\nMaarten Oude Rikmanspoel\nI love working with both technology and people. 
Currently working as a freelance data engineer and business intelligence specialist to satisfy the tech part of my heart. Fell in love with Python and the PyData modules in 2017 after unsuccessful relationships with Java and C++ in the past. Applying this in a variety of industries and companies.\n\nIn parallel, I\u2019ve been building CalmCode.nl for the past 1.5 years with the aim of guiding software developers, IT- and data specialists towards less stress and burnouts. I\u2019ve seen too many bad examples in the larger companies and multi-nationals where developers almost looked oppressed instead of being able to do their work properly and in a nice environment. So the people-oriented part of my heart gets fuelled when I see people grow and being able to take control of their lives again.\n\n\u201cWe\u2019re all just walking each other home.\u201d Ram Dass\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1716, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/xhvJa7ETy2E/maxresdefault.jpg", + "title": "Rikmanspoel - import full-focus as ff \u2013 How to reduce stress and pressure as a data specialist", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=xhvJa7ETy2E" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json new file mode 100644 index 000000000..e4ba36600 --- /dev/null +++ b/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "In this talk we will see why the expression engine in Polars is so versatile and fast.\nWe will look at expressions from the perspective of the optimizer as well as the physical engine.\n\nPolars expressions are a DSL to a very powerful vectorized engine. 
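As a minimal sketch of the expression DSL this abstract refers to, built around Polars' lazy engine so the query optimizer can rearrange and parallelize the work; the data is illustrative:

~~~python
# Minimal Polars expression sketch: declarative, optimizable, parallel.
import polars as pl

df = pl.DataFrame({
    "group": ["a", "a", "b", "b"],
    "value": [1.0, 2.0, 3.0, 4.0],
})

result = (
    df.lazy()               # build a query plan instead of executing eagerly
    .group_by("group")
    .agg([
        pl.col("value").sum().alias("total"),
        pl.col("value").mean().alias("avg"),
    ])
    .sort("group")
    .collect()              # the optimizer and physical engine run here
)
print(result)
~~~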
They make it very easy to write parallel, efficient and readable code.\n\nBio:\nRitchie Vink\nRitchie Vink is the author of the Polars query engine / DataFrame library and the CEO/Co-Founder of Polars the company.\nHe originally has a background in Civil Engineering, but switched fields and has most of his work experience in machine learning and software development. Though what truly matters is the experience he gained in his side-projects.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1468, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/NJbBWDzZuWs/maxresdefault.jpg", + "title": "Ritchie Vink - Polars and a peek into the expression engine | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=NJbBWDzZuWs" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json b/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json new file mode 100644 index 000000000..6ef2fc324 --- /dev/null +++ b/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json @@ -0,0 +1,28 @@ +{ + "description": "For many people, a museum is the last place they would expect to find cutting-edge data science, but the world of cultural heritage is full of fascinating challenges for imaging and computation. The availability of high-resolution imaging, high-speed internet, and modern computational tools allows us to image cultural heritage objects in staggering detail and with a wide array of techniques. The result, though, is a data deluge: studying single objects like Rembrandt's Night Watch can generate terabytes of data, and there are millions of objects in the world's museums. \n\nThe huge Python ecosystem enables us to build tools to process, analyze, and visualize these data. Examples include creating the 717 gigapixel (!) 
image of the Night Watch and reconstructing the painting's long-lost missing pieces using AI; controlling a camera and automated turntable in Jupyter for 3D object photography; revealing hidden watermarks in works on paper using a hybrid physics and deep learning-based ink-removal model; using chemical imaging and convolutional neural networks to see the hidden structure of Rembrandt and Vermeer paintings; and using a webcam or smartphone camera to do real-time similarity search over a database of 2.3 million open-access cultural heritage images at 4 frames per second.\n\nThese and several other live demonstrations show how Python is essential in our work to help the world access, preserve, and understand its cultural heritage.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 2262, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/kMfl5SzfkVc/maxresdefault.jpg", + "title": "Robert Erdmann - Keynote - Python for Imaging and Artificial Intelligence in Cultural Heritage", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=kMfl5SzfkVc" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json b/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json new file mode 100644 index 000000000..fb5741639 --- /dev/null +++ b/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Maintaining on-premise clusters poses quite a few challenges. One of these challenges is achieving developer autonomy, where developers can deploy applications themselves. This talk will cover how we set up Kubernetes to achieve exactly that.\n\nAs your datasets grow and you gain more use-cases, so does the number of required tools and applications. Where in the past a data cluster consisted of just HDFS, Spark, Airflow and Postgres, you now need OLAP databases, distributed query engines, parallel-computing for your model training and much more. All of this puts a lot of pressure on the infrastructure team responsible for installing & maintaining all the tools on your platform. By introducing Kubernetes, we change that responsibility to just maintaining HDFS and Kubernetes, and move the responsibility of maintaining and introducing the data tools to the data (platform) engineers.\n\nIn this talk we will cover how we achieved developer autonomy by touching the following subjects:\n- What is the first step to installing Kubernetes on premise?\n- How do we deploy changes automatically?\n- How do we make an experimentation-friendly environment for developers while remaining secure?\n- How do we handle secrets to connect different applications together?\n- Finally, some lessons learned from the migration process.\n\nBio:\nJorrick Sleijster\nJorrick is a Data Platform Engineer at Adyen. With a background in computer science, his focus has been on introducing and maintaining tools on the data platform. On the side Jorrick is an active open-source contributor to pet projects and Apache Airflow. One of the contributions was awarded PR-of-the-month of the Apache Airflow project.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. 
\n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1527, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/kgkZmk4EvWA/maxresdefault.jpg", + "title": "Sleijster - Achieving developer autonomy on on-premise data clusters using Kubernetes | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=kgkZmk4EvWA" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json b/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json new file mode 100644 index 000000000..aaf9d3137 --- /dev/null +++ b/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json @@ -0,0 +1,28 @@ +{ + "description": "The Difference-in-Differences (DiD) methodology is a popular causal inference method utilized by leading tech firms such as Microsoft Research, LinkedIn, Meta, and Uber. Yet recent studies suggest that traditional DiD methods may have significant limitations when treatment timings differ. An effective alternative is the implementation of the staggered DiD design. We exemplify this by investigating an interesting question in the music industry: Does featuring a song in TV shows influence its popularity, and are there specific factors that could moderate this impact?\n\nDifference-in-differences (DiD) is a causal inference method frequently used in empirical research in industry and academia. However, standard DiD has limitations when interventions occur at different times or affect varying groups. This talk will highlight the application of the Staggered DiD method, a more nuanced approach that addresses these limitations, in the context of the music industry. We will try to answer the question of how music features in TV shows affect music popularity and how this effect might change for different types of music using the staggered DiD method. Attendees will gain an understanding of causal inference through observational studies and specifically how the new DiD methods are used through an interesting and original case study.\n\nThe talk will be structured as follows:\n\nIntro to the case (e.g., background on music features on TV, dataset)\nExplanation of the DiD approach and its limitations.\nIntroduction to the Staggered DiD method.\nApplication of staggered DiD for the case study from the music industry\nConclusions\nQ&A\nTarget Audience: The talk would be beneficial for data scientists, researchers, and practitioners interested in causal inference, marketing analytics, and quasi-experimental design. 
Attendees should have a basic understanding of statistical methods used in data science.\n\nKey Takeaways:\n\nUnderstanding of the DiD approach and its limitations in the context of analyses with observational data.\nInsights into the Staggered DiD method and its application.\nPractical knowledge about executing and evaluating DiD studies effectively.\n\nBio:\nNazli M. Alagoz\nI am a quantitative researcher and data scientist with a strong background in marketing, economics, and econometrics. My focus is on using data-driven approaches to tackle complex business challenges, uncover valuable insights, and drive impactful decisions. As a Ph.D. candidate in quantitative marketing, I specialize in causal inference, machine learning, and experimental design to address cutting-edge research questions.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1278, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/rkP4TK1SPVQ/maxresdefault.jpg", + "title": "Staggered Difference-in-Differences in Practice: Causal Insights from the Music Industry | PDAMS 23", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=rkP4TK1SPVQ" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json b/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json new file mode 100644 index 000000000..e585ce175 --- /dev/null +++ b/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json @@ -0,0 +1,28 @@ +{ + "description": "The customers of Picnic use images and texts of products to decide if they like our products, so why not include those data streams in our Temporal Fusion Transformers that we use for Product Demand Forecasting?\n\nJoin us for a thrilling journey through convolutional, graph-based, and transformer-based architectures. Learn about methods to turn images, texts, and geographical information into features for other applications as we did for product demand forecasting. 
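As a hedged sketch of the canonical two-way fixed effects DiD regression that this abstract takes as its starting point (the design whose limitations under staggered adoption motivate the newer estimators; those estimators themselves are not shown), on synthetic staggered-adoption data with statsmodels:

~~~python
# Hedged sketch: two-way fixed effects DiD on synthetic staggered data.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(7)
units, periods = 40, 12
df = pd.DataFrame(
    [(u, t) for u in range(units) for t in range(periods)],
    columns=["unit", "period"],
)
# Staggered treatment: each treated unit adopts at its own start period.
start = {u: rng.integers(4, 10) for u in range(units // 2)}
df["treated"] = [
    int(u in start and t >= start[u]) for u, t in zip(df.unit, df.period)
]
df["y"] = 1.5 * df["treated"] + rng.normal(size=len(df))  # true effect 1.5

# Outcome ~ treatment dummy + unit and period fixed effects.
model = smf.ols("y ~ treated + C(unit) + C(period)", data=df).fit()
print(model.params["treated"])
# Fine here because the effect is constant; with effects that vary across
# adoption cohorts, this estimate can be badly biased, which is what
# motivates the staggered DiD estimators the talk discusses.
~~~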
Discover how Picnic Technologies uses state-of-the-art multimodal approaches for demand forecasting to prevent food waste and keep our customers happy!\n\nEver wondered how we keep your favorite brand of potato chips in stock, while that exotic sauce is forever \"currently unavailable\"? We'll reveal the secrets behind these mysteries in our talk on how we are using recent advancements in visual, textual, and contextual information processing techniques to optimize our Product Demand Forecasting. Because everybody loves looking at pictures of groceries but prefers having them available and on their doorstep (delivered for free).\n\nWe begin by shedding light on traditional product demand forecasting - the 'old potatoes' of the industry - and its limitations, like the notorious cold start problem and category dynamics.\n\nOur talk is a must-watch for data scientists, product managers, supply chain wizards, and anyone who has ever been curious about the new innovations in number-crunching that gets your favorite snack from the factory to your front door. If you're in the e-commerce or retail industries, this talk will be as essential as oat milk and bread on a shopping list. Don\u2019t worry if words like multimodal, temporal, and fusion sound intimidating; they will be explained in a way that is informative and entertaining, whether you have seen them before or not.\n\nWe promise it\u2019s not all graphs and matrices \u2013 expect an unexpected rollercoaster ride through the aisle of our digital store. With each turn, you'll discover how our multimodal method uses product images, textual descriptions, and additional contextual information to predict if potatoes will overtake pasta in popularity next month. We'll show you the \u2018cart\u2019 loads of data behind these predictions, putting a fun spin on the world of groceries.\n\nIn the grand finale, we\u2019ll take you behind the scenes of our model's showdown with traditional methods. Spoiler alert: our method doesn\u2019t just predict demand; it leaves the traditional methods looking like overripe bananas in the back of the fridge (which is a bad state for bananas to be in).\n\nThe main takeaway from our talk - besides a craving for potatoes - will be an understanding of multimodal demand forecasting and how all these different types of data are becoming easier and easier to use for real-world business value. By the end of our talk, you'll be filled with ideas (and the sudden need to do your groceries with Picnic; you are our target audience: you love reliability and good products, and you have busy jobs), inspired by the potential of multimodal machine learning in forecasting. So, whether you're a data scientist, product manager, or a curious shopper, come along for an enjoyable trip through the world of groceries and demand forecasting!\n\nPrepare your shopping list and join us. Just remember, our model may predict the demand for potatoes, but it's still up to you to remember the dip!\n\nBio:\nMaarten Sukel\nMaarten is a Data Scientist at Picnic Technologies, working mostly on Demand Forecasting and running machine learning at scale. Meanwhile, at the University of Amsterdam, he works on research into the use of multimodal approaches for a range of applications.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. 
PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1165, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/ZZLQ1KJLRYE/maxresdefault.jpg", + "title": "Sukel - Multimodal Product Demand Forecasting: From pixels on your screen to a meal on your plate", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=ZZLQ1KJLRYE" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json b/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json new file mode 100644 index 000000000..318b2176c --- /dev/null +++ b/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json @@ -0,0 +1,28 @@ +{ + "description": "Until a few years ago, data science & engineering at Booking.com had grown largely in an ad-hoc manner. This growth has led to a labyrinth of unrelated scripts representing Extract-Transform-Load (ETL) processes. Without options for quickly testing cross-application interfaces, maintenance and contribution grew unwieldy, and debugging in production was a common practice.\n\nOver the past several years, we\u2019ve spearheaded a transition from isolated workflows to a well-structured community-maintained monorepo - a task that required not just technical adaptation, but also a cultural shift.\n\nCentral to this transformation is the adoption of the concept of \"tables as code\", an approach that has changed the way we write ETL. Our lightweight PySpark extension represents table metadata as a Python class, exposing data to code, and enabling efficient unit test setup and validation.\n\nIn this talk, we walk you through \u201ctables as code\u201d design and complementary tools such as efficient unit testing, robust telemetry, and automated builds using Bazel. Moreover, we will cover the transformation process, including enabling people with non-engineering backgrounds to create fully tested and maintainable ETL. 
This includes internal training, maintainers, and support strategies aimed at fostering a community knowledgeable in best practices.\n\nThis talk is aimed at ETL-adjacent data science practitioners, ideally who have been wondering how to push code quality forward at a data-centric organization.\n\nIntroduction (0-5 minutes): We begin by shedding light on the infrastructure that hosted the old scripts, and discuss our motivation for change. It\u2019s worth mentioning that this transformative decision emerged from individual product teams, not from an executive mandate.\nTables as Code (10 minutes): We'll then introduce the concept of 'tables as code', detailing how this approach enables efficient testing.\nMonorepo Transformation (10 minutes): Building on this foundation, we'll explore how 'tables as code' grew into a vast monorepo with thousands of tests. We'll discuss how we scaled our processes and nurtured this project as a community effort.\nCommunity Growth and Future Plans (5 minutes): In our closing segment, we'll share insights gained from growing this project as a community, highlight strategies for orchestrating training, community support, and finally, share our future plans both within and outside our organization.\n\nBios:\nBram van den Akker\nBram van den Akker is a Senior Machine Learning Scientist at Booking.com with a background in Computer Science and Artificial Intelligence from the University of Amsterdam. At Booking.com, Bram has been one of the founders of bkng-data, an internal collection of Python tools aimed at improving code quality, testing, and streamlining CI/CD for data practitioners.\nAside from bkng-data, Bram's work focuses on bridging the gap between applied research and practical requirements for Bandit Feedback all across Booking.com. Previously, Bram has held positions at Shopify, Panasonic & Eagle Eye Networks, and has peer reviewed contributions and tutorials to conferences and workshops such as TheWebConf (WWW), RecSys, and KDD, including a best-paper award.\n\n\nJon Smith\nJon Smith is a Senior Machine Learning Scientist at Booking.com, having spent his time working in fraud detection and performance marketing. In these areas, he focusses on strengthening software practices within critical ML systems, through evangelising code quality and unit testing.\nHe studied Mathematics and Computer Science at Acadia University and Simon Fraser University in Canada, and spent some time as a Machine Learning Engineer at the Canadian Broadcasting Corporation.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1464, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/gTARHyGrcq0/maxresdefault.jpg", + "title": "Tables as Code: The Journey from Ad-hoc Scripts to Maintainable ETL Workflows at Booking.com", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=gTARHyGrcq0" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json b/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json new file mode 100644 index 000000000..896eef281 --- /dev/null +++ b/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Numerous packages exist within the Python open-source ecosystem for algorithm building and data visualization. However, a significant challenge persists, with over 85% of Data Science Pilots failing to transition to the production stage.\n\nThis talk introduces Taipy, an open-source Python library for front-end and back-end development. It enables Data Scientists and Python Developers to create pilots and production-ready applications for end-users.\n\nIts syntax facilitates the creation of interactive, customizable, and multi-page dashboards with augmented Markdown. Without the need for web development expertise (no CSS or HTML), users can generate highly interactive interfaces.\n\nAdditionally, Taipy is engineered to construct robust and tailored data-driven back-end applications. Intuitive components like pipelines and data flow orchestration empower users to organize and manage data effectively. Taipy also introduces a unique Scenario Management functionality, facilitating \"what-if\" analysis for data scientists and end-users.\n\nDuring this talk, we will showcase the capabilities of Taipy:\n- to create highly-interactive applications easily without any knowledge in web development.\n- to fill a void within the standard Python back-end stack, offering a powerful solution for data-driven applications.\n\nBios:\nFlorian Jacta\n-Specialist of Taipy, a low-code open-source Python package enabling Python developers to develop a production-ready AI application quickly. Package pre-sales and after-sales function.\n-Data Scientist for Groupe Les Mousquetaires (Intermarche) and ATOS.\n-Developed several Predictive Models as part of strategic AI projects.\n-Master in Applied Mathematics from INSA, Major in Data Science and Mathematical Optimization.\n\nAlexandre Sajus\nAlex worked in Amazon Business Intelligence. He graduated with a Master of Engineering at CentraleSup\u00e9lec - Paris-Saclay University and joined Taipy as a Community Success Consultant. His primary skills are MLOps, Machine Learning, Data Engineering, and Python.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. 
PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1700, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/REYDT6FYHAc/maxresdefault.jpg", + "title": "Turning your Data/AI algorithms into full web applications in no time with Taipy | PDAMS 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=REYDT6FYHAc" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json b/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json new file mode 100644 index 000000000..299f68250 --- /dev/null +++ b/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json @@ -0,0 +1,28 @@ +{ + "description": "In this talk, we would like to introduce you to the urban challenges that the City of Amsterdam is trying to solve using AI. We will walk you through the technical details behind one of our projects and invite you to join us in the ethical development of cool AI applications for social good.\n\nThe City of Amsterdam has the mission of promoting the development of artificial intelligence to improve the lives of Amsterdam\u2019s residents. We conduct cutting-edge research into the analysis of text, images, and point cloud data, all with the aim of solving the urban challenges of our generation and the ones to come.\n\nRecently, we\u2019ve been working on making our city more inclusive by mapping accessibility infrastructure in the public space. We\u2019ve been also working on making the city safer by localizing all street lights and automatically extracting some of their characteristics. Finally, our analysis of trees and greenery in the city can help increase the city's biodiversity and also help us reach our climate goals.\n\nWorking in the public sector means that technology itself is only a part of our job. On a daily basis, we also need to ensure that all development is done according to our city\u2019s values \u2013 for example, that applications benefit everyone, that we are open and transparent, and that we give citizens a say in shaping their (digital) city. 
This means (at the very least) that open-source development and the publication of methodology, data, and insights for all of our algorithms are an inseparable part of work.\n\nIn this talk, we would like to introduce you to the challenges that we face, walk you through the technical details behind one of our projects, and share the related open-source materials that can be reused by the PyData community. Finally, we hope to inspire you to join us in the ethical development of cool AI applications for social good.\n\nBios:\nShayla Jansen\nShayla is a data scientist at the City of Amsterdam, part of the dedicated Urban Innovation and R&D Team which aims to improve the livability of Amsterdam by bringing AI research to the city.\n\nNiek IJzerman\nNiek is a data scientist at the City of Amsterdam, part of the dedicated Urban Innovation and R&D Team. Niek is a recent graduate from the MSc AI program at the UvA and currently focusses on automated asset management in 3D using AI and Data Science.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1220, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/fch2WAEyyc0/maxresdefault.jpg", + "title": "Using AI to make Amsterdam greener, safer, and more accessible", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=fch2WAEyyc0" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json new file mode 100644 index 000000000..60341ff60 --- /dev/null +++ b/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json @@ -0,0 +1,32 @@ +{ + "description": "Last year, the pandas community adopted a new process for making significant changes to the library: the Pandas Enhancement Proposals, aka PDEPs (similar to Python's PEPs and numpy's NEPs, ..). 
In the meantime, several of those proposals have been put forward and discussed, and some already accepted, shaping up the pandas roadmap (https://pandas.pydata.org/about/roadmap.html).\n\nThe goal of this talk is to introduce you to this new process, and give an overview of a few of the proposed PDEPs. This way, you will learn about some of the behavioural changes you can expect as a pandas user in the near future.\n\nOver the many years of development, pandas has grown (or kept since the early days) quite some corner cases and inconsistencies. Some of the proposed PDEPs are an attempt to tackle those. For example, one accepted proposal is to ban any (up)casting in \"setitem-like\" operations, avoiding surprising data type changes. There is also a proposal to stop providing the inplace option for many methods, because even though the name might imply otherwise, those operations were not actually done in-place. Another major change that is under way is a change to the copy and view semantics of operations in pandas (related to the well-known (or hated) SettingWithCopyWarning). This is already available as an experimental opt-in to test and use the new behaviour, and will probably be a highlight of pandas 3.0.\n\nBio:\nJoris Van den Bossche\nI am a core contributor to Pandas and Apache Arrow, and maintainer of GeoPandas. I did a PhD at Ghent University and VITO in air quality research and worked at the Paris-Saclay Center for Data Science. Currently, I work at Voltron Data, contributing to Apache Arrow, and am a freelance teacher of python (pandas) at Ghent University.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1608, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + }, + { + "label": "https://pandas.pydata.org/about/roadmap.html", + "url": "https://pandas.pydata.org/about/roadmap.html" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/z47QwqDUKTo/maxresdefault.jpg", + "title": "Van den Bossche - What the PDEP? 
An overview of some upcoming pandas changes | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=z47QwqDUKTo" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json new file mode 100644 index 000000000..c548f172b --- /dev/null +++ b/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "What can we learn from engineering, the history of machine learning, fantasy books, the early 1990s internet, and art history about how to be successful engineers in the modern-day data landscape? We\u2019ll learn together in this talk.\n\nBios:\nVicki Boykis\nVicki Boykis works on end-to-end ML applications. Her interests include the intersection of information retrieval and large language models, applying engineering best practices to machine learning, and Nutella. She works at Duo Security and she lives in Philadelphia with her family. Her favorite hobby was making terrible jokes on Twitter when it was still good. She recently wrote a deep dive on embeddings and put together Normconf, celebrating normcore workflows in ML.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 2276, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/icGHT7MiaFY/maxresdefault.jpg", + "title": "Vicki Boykis - Keynote \"Build and keep your context window\" | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=icGHT7MiaFY" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json b/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json new file mode 100644 index 000000000..008f80693 --- /dev/null +++ b/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json @@ -0,0 +1,28 @@ +{ + "description": "In this talk I will try to show you what might happen if you allow yourself the creative freedom to rethink and reinvent common practices once in a while. As it turns out, in order to do that, natural intelligence is all you need. And we may start needing a lot of it in the near future.\n\nI've met a lot of authoritative people in my field who pass out advice that sounds like this:\n\nWorking on recommenders? Collect all the data! Sessions!\nWorking on text classification? That's a solved problem! Bert!\nWorking with embeddings? There's a library for that already!\nWorking on tabular data? XGBoost for the win! GridSearch!\nIn short: \"this is how you do data science, don't go and reinvent the wheel\".\n\nIf you spend 5 minutes thinking about \"the invention of the wheel\" though, then you may start to rethink. After all: the wheels on a bike are different from the wheels on an airplane, just like the wheels of a tractor. And for Pete's sake: that's a good thing! If we hadn't reinvented those wheels, we'd be stuck with wooden horse carts.\n\nSo ... what might happen if we take the time to rethink a few things?\n\nSpecifically, this keynote will discuss the following topics:\n\ntext classification\nfraud detection\nproduct recommenders\nactive learning\nembeddings\nI hope you'll join me for some new ideas as well as some live demos.\n\nBio:\nVincent Warmerdam\nVincent D. Warmerdam is a software developer and senior data person. He currently works over at Explosion on data quality tools for developers. He\u2019s also known for creating calmcode.io as well as a bunch of open source projects. You can check out his blog over at koaning.io to learn more about those.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 2818, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/C9p7suS-NGk/maxresdefault.jpg", + "title": "Vincent Warmerdam - Keynote \"Natural Intelligence is All You Need [tm]\"", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=C9p7suS-NGk" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json b/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json new file mode 100644 index 000000000..f8b64ecae --- /dev/null +++ b/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json @@ -0,0 +1,28 @@ +{ + "description": "Don\u2019t judge a book by its cover: Using LLM created datasets to train models that detect literary features\n\nExisting book recommendation systems like Goodreads are based on correlating the reading habits of people. But what if you want a humorous book? Or a book that is set in 19th century Paris? Or a thriller, but without violence?\nWe build book recommendation systems for Dutch libraries based on more than a dozen features from historical setting, to writing style, to main character characteristics. This allows us to tailor each recommendation to individual readers.\n\nThe recent developments in LLMs are an interesting area for us to explore to improve our recommendations. However, running LLMs in production is unfortunately not always feasible. The associated costs may be too high, and running code from third parties in your daily pipeline may be undesirable. And then there\u2019s data privacy - or, in our case, intellectual copyright - to be considered as well.\n\nSo how can you reap the benefits of an LLM, without exposing yourself or your company to some of these major downsides?\n\nWe utilized LLMs to generate custom, tailor-made datasets for our literary feature detection models to train on. 
This allowed us to benefit from the high performance of large language models, without continued reliance on external parties such as OpenAI or Google.\n\nWhile you may think LLMs are not as effective for languages other than English, we\u2019ve seen major improvements in several of our models.\n\nIn this talk, we\u2019ll highlight:\n- A note on recommenders: Why does the Goodreads recommender not work for me, while Spotify\u2019s Discover Weekly is so good?\n- Different methods of getting data from books\n- Iterative process of creating a dataset using an LLM and retraining our models\n- Some notes on intellectual property and evaluation of models.\n\nBio:\nWessel Sandtke\nTypewriter repairman turned Machine Learning Engineer, now working for Bookarang, a Dutch startup working with Dutch libraries to improve the recommendations for its members.\nWrote several picture books, but is not allowed to boost those in the recommendation system.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1340, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/LERXLF4a8JM/maxresdefault.jpg", + "title": "Wessel Sandtke - Don\u2019t judge a book by its cover: Using LLM created datasets to train models...", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=LERXLF4a8JM" + } + ] +} diff --git a/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json new file mode 100644 index 000000000..03f9fd3f7 --- /dev/null +++ b/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json @@ -0,0 +1,28 @@ +{ + "description": "Fraud is a major problem for financial services companies. As fraudsters change tactics, our detection methods need to get smarter. Graph neural networks (GNNs) are a promising model to improve detection performance. Unlike traditional machine learning models or rule-based engines, GNNs can effectively learn from subtle relationships by aggregating neighborhood information in the financial transaction networks. 
However, it remains a challenge to adopt this new approach in production.\n\nThe goal of this talk is to share best practices for building a production ready GNN solution and hopefully spark your interest to apply GNNs to your own use cases.\n\nIn this talk, we focus on suspicious account detection for online marketplaces. These platforms allow users to set up shops and sell products with little friction. Unfortunately, this attracts fraudsters who abuse these platforms. We use GNNs to do supervised learning based on accounts previously flagged as fraudulent, so that we can learn from both account properties and the relationship between accounts. However, productionizing GNNs is a big challenge. Addressing this challenge purely using open source packages is the main focus of this talk.\n\nWe first give an overview of GNN-based fraud detection. Then we deep dive into utilizing PySpark and GraphFrames to build a transaction graph in a scalable way and convert it to DGL (Deep Graph Library) format. Next we share our experiences of setting up training and inference graphs in different time intervals, and deploying the end-to-end model pipeline in Airflow.\n\nAttendees are required to have a basic understanding of machine learning. In this informative talk, they will gain insights into fraud detection's challenges and learn best practices to productionize GNNs.\n\nBios:\nFeng Zhao\nFeng is a senior data scientist at Adyen. He is passionate about solving real business problems using innovative AI/machine learning approaches. He received his Ph.D. from the National University of Singapore.\n\nTingting Qiao\nSenior data scientist in Adyen, working in the Score team focusing on fraud detection.\nHaving PhD background in computer vision and natural language processing using deep neural networks. Familiar with prediction models, such as regression, classification models, etc., as well as the latest research techniques, such as adversarial learning, neural networks etc. Several years of experience with popular deep learning frameworks.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "duration": 1453, + "language": "eng", + "recorded": "2023-09-14", + "related_urls": [ + { + "label": "Conference Website", + "url": "https://amsterdam2023.pydata.org/cfp/schedule/" + }, + { + "label": "https://github.com/numfocus/YouTubeVideoTimestamps", + "url": "https://github.com/numfocus/YouTubeVideoTimestamps" + } + ], + "speakers": [ + "TODO" + ], + "tags": [], + "thumbnail_url": "https://i.ytimg.com/vi/z_s-RUku2_4/maxresdefault.jpg", + "title": "Zhao & Qiao - Graph Neural Networks for Real World Fraud Detection | PyData Amsterdam 2023", + "videos": [ + { + "type": "youtube", + "url": "https://www.youtube.com/watch?v=z_s-RUku2_4" + } + ] +} From e9830d6fb0530015c3f0da9366ca09d3a811f22e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ezequiel=20Leonardo=20Casta=C3=B1o?= <14986783+ELC@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:20:44 -0300 Subject: [PATCH 2/4] Add speaker information --- ...s-to-the-rescue-pydata-amsterdam-2023.json | 4 +-- ...-and-everything-in-between-pdams-2023.json | 5 ++-- ...thon-open-source-ecosystem-pdams-2023.json | 4 +-- ...oding-and-when-to-use-what-pdams-2023.json | 4 +-- ...ian-tournaments-pydata-amsterdam-2023.json | 4 +-- ...face-and-skorch-pydata-amsterdam-2023.json | 4 +-- ...lue-chain-as-a-product-data-scientist.json | 2 +- ...-with-duckdb-and-arrowflight-pdams-23.json | 5 ++-- ...boosting-models-pydata-amsterdam-2023.json | 5 ++-- ...sis-a-deep-dive-pydata-amsterdam-2023.json | 4 +-- ...s-of-data-testing-hell-still-relevant.json | 4 +-- ...m-in-production-pydata-amsterdam-2023.json | 4 +-- ...r-machine-learning-model-optimization.json | 2 +- ...abyte-data-lake-pydata-amsterdam-2023.json | 4 +-- ...mpaigns-under-uncertainty-with-pystan.json | 2 +- ...players-in-pymc-pydata-amsterdam-2023.json | 4 +-- ...-computer-vision-pipelines-pdams-2023.json | 5 ++-- ...transfer-with-compact-neural-networks.json | 4 +-- ...s-forecasting-in-the-renewable-energy.json | 4 +-- ...e-language-models-tools-and-knowledge.json | 7 +++-- ...e-way-of-eating-pydata-amsterdam-2023.json | 4 +-- ...x-max-in-pandas-pydata-amsterdam-2023.json | 4 +-- ...ess-to-my-to-do-list-pydata-amsterdam.json | 4 +-- ...n-wasn-t-needed-pydata-amsterdam-2023.json | 4 +-- ...ithout-dystopia-pydata-amsterdam-2023.json | 8 ++++-- ...i-d-like-them-to-do-pd-amsterdam-2023.json | 4 +-- ...rated-learning-with-flower-pdams-2023.json | 4 +-- ...mmers-ok-doomer-pydata-amsterdam-2023.json | 4 +-- ...ng-with-distance-functions-pdams-2023.json | 4 +-- ...man-in-the-loop-pydata-amsterdam-2023.json | 4 +-- ...p-monitor-wildlife-in-parks-in-africa.json | 5 ++-- ...ation-impact-and-detection-pdams-2023.json | 4 +-- ...roach-with-insights-from-the-industry.json | 2 +- ...ent-with-duckdb-pydata-amsterdam-2023.json | 5 ++-- ...of-news-readers-pydata-amsterdam-2023.json | 5 ++-- ...al-driven-machine-learning-pdams-2023.json | 4 +-- .../pydata-amsterdam-2023-opening-notes.json | 28 ------------------- ...odel-deployment-pydata-amsterdam-2023.json | 4 +-- ...ne-with-dagster-pydata-amsterdam-2023.json | 4 +-- ...i-level-forecasting-models-pdams-2023.json | 4 +-- ...ess-and-pressure-as-a-data-specialist.json | 4 +-- ...pression-engine-pydata-amsterdam-2023.json | 4 +-- ...ial-intelligence-in-cultural-heritage.json | 4 +-- ...-clusters-using-kubernetes-pdams-2023.json | 4 +-- ...ghts-from-the-music-industry-pdams-23.json | 4 +-- ...n-your-screen-to-a-meal-on-your-plate.json | 4 +-- ...tainable-etl-workflows-at-booking-com.json | 3 +- 
...ions-in-no-time-with-taipy-pdams-2023.json | 5 ++-- ...dam-greener-safer-and-more-accessible.json | 3 +- ...-pandas-changes-pydata-amsterdam-2023.json | 4 +-- ...-context-window-pydata-amsterdam-2023.json | 8 ++++-- ...tural-intelligence-is-all-you-need-tm.json | 4 +-- ...-llm-created-datasets-to-train-models.json | 4 +-- ...fraud-detection-pydata-amsterdam-2023.json | 5 ++-- 54 files changed, 120 insertions(+), 130 deletions(-) delete mode 100644 pydata-amsterdam-2023/videos/pydata-amsterdam-2023-opening-notes.json diff --git a/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json index 71987e1dd..7744592a4 100644 --- a/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json @@ -30,11 +30,11 @@ } ], "speakers": [ - "TODO" + "Adrin Jalali" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/9w_H5OSTO9A/maxresdefault.jpg", - "title": "Adrin - Let\u2019s exploit pickle, and `skops` to the rescue! | PyData Amsterdam 2023", + "title": "Let's exploit pickle, and `skops` to the rescue!", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json b/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json index 671a88fa9..e3e9416a9 100644 --- a/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Alon Nir", + "Dror A. Guldin" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/Yd35Q2oclY8/maxresdefault.jpg", - "title": "Alon Nir & Dror A. Guldin - Power Users, Long Tail Users, and Everything In Between... 
| PDAMS 2023", + "title": "Power Users, Long Tail Users, and Everything In Between: Choosing Meaningful Metrics and KPIs for Product Strategy", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json b/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json index 0b9302106..2e8d86029 100644 --- a/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Alyona Galyeva" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/YGKqvMhaEVA/maxresdefault.jpg", - "title": "Alyona Galyeva - Data Contracts in action powered by Python open source ecosystem | PDAMS 2023", + "title": "Data Contracts in action powered by Python open source ecosystem", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json b/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json index d638f6031..a893e762c 100644 --- a/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Ana Chaloska" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/4Opsiqj6gcY/maxresdefault.jpg", - "title": "Ana Chaloska - To One-Hot or Not: A guide to feature encoding and when to use what | PDAMS 2023", + "title": "To One-Hot or Not: A guide to feature encoding and when to use what", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json index 133be0e05..393511f04 100644 --- a/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Andy Kitchen" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/UY3wxjk2o6o/maxresdefault.jpg", - "title": "Andy Kitchen - Promptly Evaluating Prompts with Bayesian Tournaments | PyData Amsterdam 2023", + "title": "Promptly Evaluating Prompts with Bayesian Tournaments", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json index 189ae5f3a..2a4e774d7 100644 --- a/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json @@ -18,11 +18,11 @@ } 
], "speakers": [ - "TODO" + "Benjamin Bossan" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/y_n7BjDCS-M/maxresdefault.jpg", - "title": "Bossan - Extend your scikit-learn workflow with Hugging Face and skorch | PyData Amsterdam 2023", + "title": "Extend your scikit-learn workflow with Hugging Face and skorch", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json b/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json index 1e22e1216..e883f98c8 100644 --- a/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json +++ b/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json @@ -14,7 +14,7 @@ } ], "speakers": [ - "TODO" + "Azamat Omuraliev" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/NypFnCRjXJQ/maxresdefault.jpg", diff --git a/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json b/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json index 4d3708833..6df38c28c 100644 --- a/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json +++ b/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Fabio Buso", + "Till Döhmen" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/w_cAeE5ShnM/maxresdefault.jpg", - "title": "Buso & D\u00f6hmen - MLOps on the fly: Optimizing a feature store with DuckDB and ArrowFlight | PDAMS 23", + "title": "MLOps on the fly: Optimizing a feature store with DuckDB and ArrowFlight", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json index 797b4e4d8..dc973843f 100644 --- a/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Busra Cikla", + "Paul Zhutovsky" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/lmQw_B-JP9o/maxresdefault.jpg", - "title": "Cikla & Zhutovsky - Transfer Learning in Boosting Models | PyData Amsterdam 2023", + "title": "Transfer Learning in Boosting Models", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json index da3e46ea2..c5d250a0a 100644 --- a/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Danial Senejohnny" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/I33h5-GmHSM/maxresdefault.jpg", - "title": "Danial 
Senejohnny - Survival Analysis: a deep dive | PyData Amsterdam 2023", + "title": "Survival Analysis: a deep dive", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json b/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json index 71b0dd642..0c7dda74f 100644 --- a/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json +++ b/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json @@ -18,11 +18,11 @@ } ], "speakers": [ - "TODO" + "Daniel van der Ende" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/05py1CEyLxo/maxresdefault.jpg", - "title": "Daniel van der Ende- Return to Data's Inferno: are the 7 layers of data testing hell still relevant?", + "title": "Return to Data's Inferno: are the 7 layers of data testing hell still relevant?", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json index a7e26e2dd..e02194306 100644 --- a/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Emeli Dral" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/aLwDCU8KZB0/maxresdefault.jpg", - "title": "Emeli Dral - Mind the language: how to monitor NLP and LLM in production | PyData Amsterdam 2023", + "title": "Mind the language: how to monitor NLP and LLM in production", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json b/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json index a6026a976..9bd2401bd 100644 --- a/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json +++ b/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json @@ -14,7 +14,7 @@ } ], "speakers": [ - "TODO" + "Felipe Moraes" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/XUFS-jVpKIc/maxresdefault.jpg", diff --git a/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json index bf7689921..0261c6d6f 100644 --- a/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Fokko Driesprong" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/1A7fFB8QTPY/maxresdefault.jpg", - "title": 
"Fokko Driesprong - PyIceberg: Tipping your toes into the petabyte data-lake | PyData Amsterdam 2023", + "title": "PyIceberg: Tipping your toes into the petabyte data-lake", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json b/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json index 49aca70d7..4d4ad7445 100644 --- a/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json +++ b/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json @@ -26,7 +26,7 @@ } ], "speakers": [ - "TODO" + "Raphael de Brito Tamaki" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/hcQST0RnN_o/maxresdefault.jpg", diff --git a/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json index b72d01d5e..bddbae400 100644 --- a/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Francesco Bruzzesi" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/z79fClXBOnM/maxresdefault.jpg", - "title": "Francesco Bruzzesi - Bayesian ranking for tennis players in PyMC | PyData Amsterdam 2023", + "title": "Bayesian ranking for tennis players in PyMC", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json b/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json index 7398c9c52..2e7ad7bd7 100644 --- a/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Wesley Boelrijk", + "Jeroen Rombouts" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/z2DJzByPKJE/maxresdefault.jpg", - "title": "From Vision to Action: Designing and Deploying Effective Computer Vision Pipelines | PDAMS 2023", + "title": "From Vision to Action: Designing and Deploying Effective Computer Vision Pipelines", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json b/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json index d71d8ca1d..159bcc7fe 100644 --- a/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json +++ b/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json @@ -10,11 +10,11 @@ } ], "speakers": [ - "TODO" + "Hadi Abdi Khojasteh" ], "tags": [], "thumbnail_url": 
"https://i.ytimg.com/vi/2YmGm0yf6fc/maxresdefault.jpg", - "title": "Hadi Abdi Khojasteh - Distillation Unleashed: Domain Knowledge Transfer with Compact Neural Networks", + "title": "Distillation Unleashed: Domain Knowledge Transfer with Compact Neural Networks", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json b/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json index c206de22e..f57e45801 100644 --- a/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json +++ b/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Alexander Backus" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/aIZf2cQ0r5U/maxresdefault.jpg", - "title": "Harnessing uncertainty: the role of probabilistic time series forecasting in the renewable energy...", + "title": "Harnessing uncertainty: the role of probabilistic time series forecasting in the renewable energy transition", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json b/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json index d96561318..e5799436f 100644 --- a/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json +++ b/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json @@ -14,11 +14,14 @@ } ], "speakers": [ - "TODO" + "Thomas Wolf", + "Alessandro Cappelli", + "Julien Launay", + "Guilherme Penedo" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/iQmXF5gxaWY/maxresdefault.jpg", - "title": "Hugging Face\"Processing billions of tokens for training Large Language Models, tools and knowledge\"", + "title": "Processing billions of tokens for training Large Language Models, tools and knowledge", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json index 7648ca86d..f983a4aec 100644 --- a/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Jakob Willisch" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/bABsVVbMyJc/maxresdefault.jpg", - "title": "Jakob Willisch - The proof of the pudding is in the (way of) eating... 
| PyData Amsterdam 2023", + "title": "The proof of the pudding is in the (way of) eating: quasi-experimental methods of causal inference and their practical pitfalls", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json index 5565838f5..fcfcd30ac 100644 --- a/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "James Powell" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/IyjLM-9Eq3c/maxresdefault.jpg", - "title": "James Powell - Cumulative Index Max in pandas | PyData Amsterdam 2023", + "title": "Cumulative Index Max in pandas", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json b/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json index 90d450865..5af8bc718 100644 --- a/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json +++ b/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Jordi Smit" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/-rixfb4LWiY/maxresdefault.jpg", - "title": "Jordi Smit - LLM Agents 101: How I Gave ChatGPT Access to My To-Do List | PyData Amsterdam", + "title": "LLM Agents 101: How I Gave ChatGPT Access to My To-Do List", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json index 06b14ef50..62b81f567 100644 --- a/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Katharine Jarmul" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/bEMK2w6e5xY/maxresdefault.jpg", - "title": "Katharine Jarmul - Encrypted Computation: What if decryption wasn't needed? 
| PyData Amsterdam 2023", + "title": "Encrypted Computation: What if decryption wasn't needed?", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json index 832b15259..39f1135cd 100644 --- a/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json @@ -14,11 +14,13 @@ } ], "speakers": [ - "TODO" + "Katharine Jarmul" + ], + "tags": [ + "Keynote" ], - "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/hUawmdYWAO0/maxresdefault.jpg", - "title": "Katharine Jarmul - Keynote \"AI Without Dystopia\" | PyData Amsterdam 2023", + "title": "AI Without Dystopia", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json b/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json index 97e2abe14..36576095f 100644 --- a/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Kevin Klein" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/cRS4yZt6OU4/maxresdefault.jpg", - "title": "Kevin Klein - Causal Inference Libraries: What They Do, What I'd Like Them To Do | PD Amsterdam 2023", + "title": "Causal Inference Libraries: What They Do, What I'd Like Them To Do", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json b/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json index 06dfccde7..5c7a0d207 100644 --- a/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Krishi Sharma" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/8njnK_nexEw/maxresdefault.jpg", - "title": "Krishi Sharma - Innovation in the Age of Regulation: Federated Learning with Flower | PDAMS 2023", + "title": "Innovation in the Age of Regulation: Federated Learning with Flower", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json index a35d8b9e1..fb922b8e5 100644 --- a/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Laura Summers" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/n88ZmqsTKig/maxresdefault.jpg", - "title": "Laura Summers - Ok, Doomer | PyData Amsterdam 2023", + "title": "Ok, Doomer", "videos": [ { "type": "youtube", diff --git 
a/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json b/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json index ef4ac9a2a..e30538984 100644 --- a/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Tony Bagnall" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/O5cnKAUBKkg/maxresdefault.jpg", - "title": "Lets do the time warp again: time series machine learning with distance functions | PDAMS 2023", + "title": "Lets do the time warp again: time series machine learning with distance functions", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json index 6a064d8bb..d36ec9463 100644 --- a/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json @@ -18,11 +18,11 @@ } ], "speakers": [ - "TODO" + "Lieke Kools" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/Hb_yIwkABQU/maxresdefault.jpg", - "title": "Lieke Kools - Standby detection with a human in the loop | PyData Amsterdam 2023", + "title": "Standby detection with a human in the loop", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json b/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json index 40ba51375..a48a583d5 100644 --- a/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json +++ b/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Maël Deschamps", + "Simone Gayed Said" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/CSsaGm9eJc0/maxresdefault.jpg", - "title": "Ma\u00ebl Deschamps - Our journey using data and AI to help monitor wildlife in parks in Africa", + "title": "Our journey using data and AI to help monitor wildlife in parks in Africa", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json b/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json index c920e6af5..7f16fea39 100644 --- a/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Maryam Miradi" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/BlVZddcsyV4/maxresdefault.jpg", - "title": "Maryam Miradi - Deep look into Deepfakes: Mastering Creation, Impact, and 
Detection | PDAMS 2023", + "title": "Deep look into Deepfakes: Mastering Creation, Impact, and Detection", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json b/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json index 4a6d2b90c..530cb3367 100644 --- a/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json +++ b/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json @@ -14,7 +14,7 @@ } ], "speakers": [ - "TODO" + "Ildar Safilo" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/cQJfYtTfJQg/maxresdefault.jpg", diff --git a/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json index 42e085c60..fc150d57f 100644 --- a/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Hannes Mühleisen", + "Mark Raasveldt" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/5ddoZR6PYNU/maxresdefault.jpg", - "title": "M\u00fchleisen & Raasveldt - In-Process Analytical Data Management with DuckDB | PyData Amsterdam 2023", + "title": "In-Process Analytical Data Management with DuckDB", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json index 0211bbe07..9002c44c5 100644 --- a/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Jurriaan Nagelkerke", + "Vincent Smeets" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/hLbYLP1XCfI/maxresdefault.jpg", - "title": "Nagelkerke & Smeets - Revealing the true motives of news readers | PyData Amsterdam 2023", + "title": "Revealing the true motives of news readers", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json b/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json index 546c3ba4e..8f2ada600 100644 --- a/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Okke van der Wal" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/c_dOpCvkNc0/maxresdefault.jpg", - "title": "Okke van der Wal - Personalization at Uber scale via causal-driven 
machine learning | PDAMS 2023", + "title": "Personalization at Uber scale via causal-driven machine learning", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/pydata-amsterdam-2023-opening-notes.json b/pydata-amsterdam-2023/videos/pydata-amsterdam-2023-opening-notes.json deleted file mode 100644 index 78705d58c..000000000 --- a/pydata-amsterdam-2023/videos/pydata-amsterdam-2023-opening-notes.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "description": "Opening Notes presented by Leah Silen, Executive Director of NumFOCUS.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", - "duration": 1016, - "language": "eng", - "recorded": "2023-09-14", - "related_urls": [ - { - "label": "Conference Website", - "url": "https://amsterdam2023.pydata.org/cfp/schedule/" - }, - { - "label": "https://github.com/numfocus/YouTubeVideoTimestamps", - "url": "https://github.com/numfocus/YouTubeVideoTimestamps" - } - ], - "speakers": [ - "TODO" - ], - "tags": [], - "thumbnail_url": "https://i.ytimg.com/vi/_nlryVPWTGM/maxresdefault.jpg", - "title": "PyData Amsterdam 2023 - Opening Notes", - "videos": [ - { - "type": "youtube", - "url": "https://www.youtube.com/watch?v=_nlryVPWTGM" - } - ] -} diff --git a/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json index 8d27c1d5b..34d232b39 100644 --- a/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Ziad Al Moubayed" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/KVEULeK1zk4/maxresdefault.jpg", - "title": "Reliable and Scalable ML Serving: Best Practices for Online Model Deployment | PyData Amsterdam 2023", + "title": "Reliable and Scalable ML Serving: Best Practices for Online Model Deployment", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json index 41e2034bf..7a3fb3d84 100644 --- 
a/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Riccardo Amadio" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/VilruuAAwp8/maxresdefault.jpg", - "title": "Riccardo Amadio | Declarative data manipulation pipeline with Dagster | PyData Amsterdam 2023", + "title": "Declarative data manipulation pipeline with Dagster", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json b/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json index 56fcf21d1..37b56ac71 100644 --- a/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Rik van der Vlist" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/30x-TYxZ6QY/maxresdefault.jpg", - "title": "Rik van der Vlist - Balancing the electricity grid with multi-level forecasting models | PDAMS 2023", + "title": "Balancing the electricity grid with multi-level forecasting models", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json b/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json index 85cbf5880..533ec3f56 100644 --- a/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json +++ b/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Maarten Oude Rikmanspoel" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/xhvJa7ETy2E/maxresdefault.jpg", - "title": "Rikmanspoel - import full-focus as ff \u2013 How to reduce stress and pressure as a data specialist", + "title": "import full-focus as ff - How to reduce stress and pressure as a data specialist", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json index e4ba36600..459edd7d8 100644 --- a/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Ritchie Vink" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/NJbBWDzZuWs/maxresdefault.jpg", - "title": "Ritchie Vink - Polars and a peek into the expression engine | PyData Amsterdam 2023", + "title": "Polars and a peek into the expression engine", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json 
b/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json index 6ef2fc324..52d682e9f 100644 --- a/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json +++ b/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Robert Erdmann" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/kMfl5SzfkVc/maxresdefault.jpg", - "title": "Robert Erdmann - Keynote - Python for Imaging and Artificial Intelligence in Cultural Heritage", + "title": "Python for Imaging and Artificial Intelligence in Cultural Heritage", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json b/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json index fb5741639..f19738b67 100644 --- a/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Jorrick Sleijster" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/kgkZmk4EvWA/maxresdefault.jpg", - "title": "Sleijster - Achieving developer autonomy on on-premise data clusters using Kubernetes | PDAMS 2023", + "title": "Achieving developer autonomy on on-premise data clusters using Kubernetes", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json b/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json index aaf9d3137..951a6b526 100644 --- a/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json +++ b/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Nazli M. 
Alagöz" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/rkP4TK1SPVQ/maxresdefault.jpg", - "title": "Staggered Difference-in-Differences in Practice: Causal Insights from the Music Industry | PDAMS 23", + "title": "Staggered Difference-in-Differences in Practice: Causal Insights from the Music Industry", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json b/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json index e585ce175..9322ffb74 100644 --- a/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json +++ b/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Maarten Sukel" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/ZZLQ1KJLRYE/maxresdefault.jpg", - "title": "Sukel - Multimodal Product Demand Forecasting: From pixels on your screen to a meal on your plate", + "title": "Multimodal Product Demand Forecasting: From pixels on your screen to a meal on your plate", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json b/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json index 318b2176c..76c700946 100644 --- a/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json +++ b/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json @@ -14,7 +14,8 @@ } ], "speakers": [ - "TODO" + "Bram van den Akker", + "Jon Smith" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/gTARHyGrcq0/maxresdefault.jpg", diff --git a/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json b/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json index 896eef281..16e1dbb49 100644 --- a/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Florian Jacta", + "Alexandre Sajus" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/REYDT6FYHAc/maxresdefault.jpg", - "title": "Turning your Data/AI algorithms into full web applications in no time with Taipy | PDAMS 2023", + "title": "Turning your Data/AI algorithms into full web applications in no time with Taipy", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json b/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json index 299f68250..81ef323e7 100644 --- a/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json +++ b/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json @@ -14,7 +14,8 @@ } ], "speakers": [ - "TODO" + "Shayla Jansen", + "Niek 
IJzerman" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/fch2WAEyyc0/maxresdefault.jpg", diff --git a/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json index 60341ff60..bd6728435 100644 --- a/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json @@ -18,11 +18,11 @@ } ], "speakers": [ - "TODO" + "Joris Van den Bossche" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/z47QwqDUKTo/maxresdefault.jpg", - "title": "Van den Bossche - What the PDEP? An overview of some upcoming pandas changes | PyData Amsterdam 2023", + "title": "What the PDEP? An overview of some upcoming pandas changes", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json index c548f172b..393c85914 100644 --- a/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json @@ -14,11 +14,13 @@ } ], "speakers": [ - "TODO" + "Vicki Boykis" + ], + "tags": [ + "Keynote" ], - "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/icGHT7MiaFY/maxresdefault.jpg", - "title": "Vicki Boykis - Keynote \"Build and keep your context window\" | PyData Amsterdam 2023", + "title": "Build and keep your context window", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json b/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json index 008f80693..5c6bca8c2 100644 --- a/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json +++ b/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Vincent Warmerdam" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/C9p7suS-NGk/maxresdefault.jpg", - "title": "Vincent Warmerdam - Keynote \"Natural Intelligence is All You Need [tm]\"", + "title": "Natural Intelligence is All You Need [tm]", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json b/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json index f8b64ecae..a77375547 100644 --- a/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json +++ b/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json @@ -14,11 +14,11 @@ } ], "speakers": [ - "TODO" + "Wessel Sandtke" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/LERXLF4a8JM/maxresdefault.jpg", - "title": "Wessel Sandtke - Don\u2019t judge a book by its cover: Using LLM created datasets to train 
models...", + "title": "Don't judge a book by its cover: Using LLM created datasets to train models that detect literary features", "videos": [ { "type": "youtube", diff --git a/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json index 03f9fd3f7..68e08ab25 100644 --- a/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json @@ -14,11 +14,12 @@ } ], "speakers": [ - "TODO" + "Feng Zhao", + "Tingting Qiao" ], "tags": [], "thumbnail_url": "https://i.ytimg.com/vi/z_s-RUku2_4/maxresdefault.jpg", - "title": "Zhao & Qiao - Graph Neural Networks for Real World Fraud Detection | PyData Amsterdam 2023", + "title": "Graph Neural Networks for Real World Fraud Detection", "videos": [ { "type": "youtube", From 906e16c8f87630a6dc37a524e03f8f64e0ec0aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ezequiel=20Leonardo=20Casta=C3=B1o?= <14986783+ELC@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:22:22 -0300 Subject: [PATCH 3/4] Fix conflicting characters --- ...it-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json | 2 +- ...rs-long-tail-users-and-everything-in-between-pdams-2023.json | 2 +- ...tion-powered-by-python-open-source-ecosystem-pdams-2023.json | 2 +- ...ide-to-feature-encoding-and-when-to-use-what-pdams-2023.json | 2 +- ...prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json | 2 +- ...flow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json | 2 +- ...-validating-the-value-chain-as-a-product-data-scientist.json | 2 +- ...ng-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json | 2 +- ...nsfer-learning-in-boosting-models-pydata-amsterdam-2023.json | 2 +- ...nny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json | 2 +- ...no-are-the-7-layers-of-data-testing-hell-still-relevant.json | 2 +- ...monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json | 2 +- ...usiness-metrics-for-machine-learning-model-optimization.json | 2 +- ...-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json | 2 +- ...v-for-marketing-campaigns-under-uncertainty-with-pystan.json | 2 +- ...anking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json | 2 +- ...eploying-effective-computer-vision-pipelines-pdams-2023.json | 2 +- ...-domain-knowledge-transfer-with-compact-neural-networks.json | 2 +- ...ilistic-time-series-forecasting-in-the-renewable-energy.json | 2 +- ...-for-training-large-language-models-tools-and-knowledge.json | 2 +- ...e-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json | 2 +- ...ll-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json | 2 +- ...i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json | 2 +- ...-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json | 2 +- ...armul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json | 2 +- ...what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json | 2 +- ...of-regulation-federated-learning-with-flower-pdams-2023.json | 2 +- .../videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json | 2 +- ...ies-machine-learning-with-distance-functions-pdams-2023.json | 2 +- ...etection-with-a-human-in-the-loop-pydata-amsterdam-2023.json | 2 +- ...data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json | 2 +- 
...akes-mastering-creation-impact-and-detection-pdams-2023.json | 2 +- ...an-a-b-testing-approach-with-insights-from-the-industry.json | 2 +- ...tical-data-management-with-duckdb-pydata-amsterdam-2023.json | 2 +- ...-the-true-motives-of-news-readers-pydata-amsterdam-2023.json | 2 +- ...ber-scale-via-causal-driven-machine-learning-pdams-2023.json | 2 +- ...tices-for-online-model-deployment-pydata-amsterdam-2023.json | 2 +- ...anipulation-pipeline-with-dagster-pydata-amsterdam-2023.json | 2 +- ...ity-grid-with-multi-level-forecasting-models-pdams-2023.json | 2 +- ...-how-to-reduce-stress-and-pressure-as-a-data-specialist.json | 2 +- ...a-peek-into-the-expression-engine-pydata-amsterdam-2023.json | 2 +- ...maging-and-artificial-intelligence-in-cultural-heritage.json | 2 +- ...on-on-premise-data-clusters-using-kubernetes-pdams-2023.json | 2 +- ...actice-causal-insights-from-the-music-industry-pdams-23.json | 2 +- ...ting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json | 2 +- ...oc-scripts-to-maintainable-etl-workflows-at-booking-com.json | 2 +- ...-full-web-applications-in-no-time-with-taipy-pdams-2023.json | 2 +- ...-ai-to-make-amsterdam-greener-safer-and-more-accessible.json | 2 +- ...w-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json | 2 +- ...uild-and-keep-your-context-window-pydata-amsterdam-2023.json | 2 +- ...rmerdam-keynote-natural-intelligence-is-all-you-need-tm.json | 2 +- ...by-its-cover-using-llm-created-datasets-to-train-models.json | 2 +- ...ks-for-real-world-fraud-detection-pydata-amsterdam-2023.json | 2 +- 53 files changed, 53 insertions(+), 53 deletions(-) diff --git a/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json index 7744592a4..72277ef58 100644 --- a/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/adrin-lets-exploit-pickle-and-skops-to-the-rescue-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Pickle files can be evil and simply loading them can run arbitrary code on your system. This talk presents why that is, how it can be exploited, and how skops is tackling the issue for scikit-learn/statistical ML models. We go through some lower level pickle related machinery, and go in detail how the new format works.\n\nThe pickle format has many vulnerabilities and loading them alone can run arbitrary code on the user\u2019s system [1]. In this session we go through the process used by the pickle module to persist python objects, while demonstrating how they can be exploited. We go through how __getstate__ and __setstate__ are used, and how the output of a __reduce__ method is used to reconstruct an object, and how one can have a malicious implementation of these methods to create a malicious pickle file without knowing how to manually create a pickle file by manipulating a file on a lower level. We also briefly touch on other known exploits and issues related to the format [2].\n\nWe also show how one can look inside a pickle file and the operations run by it while loading it, and how one could get an equivalent python script which would result in the output of the pickle file [3]\nThen I present an alternative format from the skops library [4] which can be used to store scikit-learn based models. 
We talk about what the format is, and how persistence and loading is done, and what we do to prevent loading malicious objects or to avoid running arbitrary code. This format can be used to store almost any scikit-learn estimator, as well as xgboost, lightgbm, and catboost models.\n\n[1] https://peps.python.org/pep-0307/#security-issues\n[2] https://github.com/moreati/pickle-fuzz\n[3] https://github.com/trailofbits/fickling\n[4] https://skops.readthedocs.io/en/stable/persistence.html\n\nBio:\nAdrin\nAdrin works on a few open source projects including skops which tackles some of the MLOps challenges related to scikit-learn models. He has a PhD in Bioinformatics, has worked as a consultant, and in an algorithmic privacy and fairness team. He's also a core developer of scikit-learn and fairlearn.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Pickle files can be evil and simply loading them can run arbitrary code on your system. This talk presents why that is, how it can be exploited, and how skops is tackling the issue for scikit-learn/statistical ML models. We go through some lower level pickle related machinery, and go in detail how the new format works.\n\nThe pickle format has many vulnerabilities and loading them alone can run arbitrary code on the user\u2019s system [1]. In this session we go through the process used by the pickle module to persist python objects, while demonstrating how they can be exploited. We go through how __getstate__ and __setstate__ are used, and how the output of a __reduce__ method is used to reconstruct an object, and how one can have a malicious implementation of these methods to create a malicious pickle file without knowing how to manually create a pickle file by manipulating a file on a lower level. We also briefly touch on other known exploits and issues related to the format [2].\n\nWe also show how one can look inside a pickle file and the operations run by it while loading it, and how one could get an equivalent python script which would result in the output of the pickle file [3]\nThen I present an alternative format from the skops library [4] which can be used to store scikit-learn based models. We talk about what the format is, and how persistence and loading is done, and what we do to prevent loading malicious objects or to avoid running arbitrary code. 
This format can be used to store almost any scikit-learn estimator, as well as xgboost, lightgbm, and catboost models.\n\n[1] https://peps.python.org/pep-0307/#security-issues\n[2] https://github.com/moreati/pickle-fuzz\n[3] https://github.com/trailofbits/fickling\n[4] https://skops.readthedocs.io/en/stable/persistence.html\n\nBio:\nAdrin\nAdrin works on a few open source projects including skops which tackles some of the MLOps challenges related to scikit-learn models. He has a PhD in Bioinformatics, has worked as a consultant, and in an algorithmic privacy and fairness team. He's also a core developer of scikit-learn and fairlearn.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1339, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json b/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json index e3e9416a9..5de990da3 100644 --- a/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/alon-nir-dror-a-guldin-power-users-long-tail-users-and-everything-in-between-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "Power Users, Long Tail Users, and Everything In Between: Choosing Meaningful Metrics and KPIs for Product Strategy\n\nData scientists in industry often have to wear many hats. They must navigate statistical validity, business acumen and strategic thinking, while also representing the end user. In this talk, we will talk about the pillars that make a metric the right one for a job, and how to choose appropriate Key Performance Indicators (KPIs) to drive product success and strategic gains.\n\nOur presentation will traverse the relationship of data science skills in product strategy - embracing the multifaceted role of the data scientist and navigating the journey from user segmentation to making data-driven decisions.\n\nThe Data Scientist's Hat Trick: We initiate by emphasising the assorted roles that a data scientist plays in today's business landscape - from being a statistician ensuring the accuracy and validity of data to a strategist driving business decisions. [5 mins]\n\nChoosing Significant Metrics: Next, we'll delve into the nuances of selecting the right metric for the job. 
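The pickle/skops abstract a few lines up is easy to verify first-hand. A minimal sketch, using only the standard library, of why merely loading a pickle can run code; the class name and shell command are illustrative, not from the talk:

~~~python
import pickle


class Innocent:
    # pickle calls __reduce__ to learn how to rebuild the object; it may
    # name ANY callable, and pickle.loads will invoke it on load.
    def __reduce__(self):
        import os
        return (os.system, ("echo code ran during unpickling",))


payload = pickle.dumps(Innocent())
pickle.loads(payload)  # prints the message: loading alone executed a command
~~~

This reconstruction-by-callable step is what the skops format sidesteps by only instantiating types it trusts instead of replaying arbitrary recipes.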
Specifically, we\u2019ll talk about the different pillars of metrics setting, for common data science responsibilities such as randomised controlled trials, offline evaluation, opportunity analysis etc. [7 mins]\n\nSetting The Right KPIs: Once metrics are defined, we'll venture into setting the correct KPIs - the small set of top line numbers that say if our venture is doing the job. [7 mins]\n\nData-Driven Decision Making: Lastly, we'll elucidate how to leverage the data you've gathered to make informed, strategic decisions. This necessitates interpreting your metrics and KPIs, spotting trends, and making necessary adjustments to stay on course. [7 mins]\n\nIncorporating real-world case studies, we'll demonstrate how these concepts intertwine to contribute to product success.\n\nLearning Objectives:\n* Appreciate the multifaceted role of a data scientist in driving product strategies.\n* Learn to set realistic and challenging KPIs that align with your company's overarching objectives.\n* Gain insights into leveraging data for informed decision-making and product strategy adjustments.\n\nBio:\nAlon Nir\nData scientist (Data Lead) at Spotify. Dismal scientist by education. Advocating against pie charts since 2015. Self-proclaimed GIF connoisseur.\n\nDror A. Guldin\nData Scientist (Tech Lead) at Meta\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Power Users, Long Tail Users, and Everything In Between: Choosing Meaningful Metrics and KPIs for Product Strategy\n\nData scientists in industry often have to wear many hats. They must navigate statistical validity, business acumen and strategic thinking, while also representing the end user. In this talk, we will talk about the pillars that make a metric the right one for a job, and how to choose appropriate Key Performance Indicators (KPIs) to drive product success and strategic gains.\n\nOur presentation will traverse the relationship of data science skills in product strategy - embracing the multifaceted role of the data scientist and navigating the journey from user segmentation to making data-driven decisions.\n\nThe Data Scientist's Hat Trick: We initiate by emphasising the assorted roles that a data scientist plays in today's business landscape - from being a statistician ensuring the accuracy and validity of data to a strategist driving business decisions. [5 mins]\n\nChoosing Significant Metrics: Next, we'll delve into the nuances of selecting the right metric for the job. 
Specifically, we\u2019ll talk about the different pillars of metrics setting, for common data science responsibilities such as randomised controlled trials, offline evaluation, opportunity analysis etc. [7 mins]\n\nSetting The Right KPIs: Once metrics are defined, we'll venture into setting the correct KPIs - the small set of top line numbers that say if our venture is doing the job. [7 mins]\n\nData-Driven Decision Making: Lastly, we'll elucidate how to leverage the data you've gathered to make informed, strategic decisions. This necessitates interpreting your metrics and KPIs, spotting trends, and making necessary adjustments to stay on course. [7 mins]\n\nIncorporating real-world case studies, we'll demonstrate how these concepts intertwine to contribute to product success.\n\nLearning Objectives:\n* Appreciate the multifaceted role of a data scientist in driving product strategies.\n* Learn to set realistic and challenging KPIs that align with your company's overarching objectives.\n* Gain insights into leveraging data for informed decision-making and product strategy adjustments.\n\nBio:\nAlon Nir\nData scientist (Data Lead) at Spotify. Dismal scientist by education. Advocating against pie charts since 2015. Self-proclaimed GIF connoisseur.\n\nDror A. Guldin\nData Scientist (Tech Lead) at Meta\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1707, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json b/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json index 2e8d86029..10ddc9b66 100644 --- a/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/alyona-galyeva-data-contracts-in-action-powered-by-python-open-source-ecosystem-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "This informative talk aims to close the gap between the theory of data contracts and their real-life implementations. It contains a few Python code snippets and is aimed primarily at data and software engineers. However, it could be food for thought for machine learning engineers, data scientists, and other data consumers.\n\nTopic: There are a lot of ongoing discussions happening about data contracts. 
I would like to share with you some lessons learned from data contract implementations and show you some Python examples.\n\nAudience: data and software engineers; potentially could be interesting for machine learning engineers, data scientists, and other data consumers. Some affinity with Pandas, Great Expectations, and Open Table Formats are desirable.\n\nType: Informative with some hands-on examples\n\nMain takeaways:\n- better understanding of the data contracts concept\n- tips for batch data contracts implementations\n- tips for streaming data contracts implementations\n\nBio: \nAlyona Galyeva\nAlyona Galyeva is an organizer of PyLadies Amsterdam, co-organizer of MLOps and Crafts, Microsoft AI MVP and Principal Engineer at Thoughtworks\nObserve - Optimize - Learn - Repeat\nPassionate about encouraging others to see different perspectives and constructively break the rules.\nI found my joy in building, optimizing, and deploying end-to-end AI and Data Engineering Solutions.\n\n===\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "This informative talk aims to close the gap between the theory of data contracts and their real-life implementations. It contains a few Python code snippets and is aimed primarily at data and software engineers. However, it could be food for thought for machine learning engineers, data scientists, and other data consumers.\n\nTopic: There are a lot of ongoing discussions happening about data contracts. I would like to share with you some lessons learned from data contract implementations and show you some Python examples.\n\nAudience: data and software engineers; potentially could be interesting for machine learning engineers, data scientists, and other data consumers. 
Some affinity with Pandas, Great Expectations, and Open Table Formats are desirable.\n\nType: Informative with some hands-on examples\n\nMain takeaways:\n- better understanding of the data contracts concept\n- tips for batch data contracts implementations\n- tips for streaming data contracts implementations\n\nBio: \nAlyona Galyeva\nAlyona Galyeva is an organizer of PyLadies Amsterdam, co-organizer of MLOps and Crafts, Microsoft AI MVP and Principal Engineer at Thoughtworks\nObserve - Optimize - Learn - Repeat\nPassionate about encouraging others to see different perspectives and constructively break the rules.\nI found my joy in building, optimizing, and deploying end-to-end AI and Data Engineering Solutions.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1504, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json b/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json index a893e762c..2b2666957 100644 --- a/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/ana-chaloska-to-one-hot-or-not-a-guide-to-feature-encoding-and-when-to-use-what-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "Have you ever struggled with a multitude of columns created by One Hot Encoder? Or decided to look beyond it, but found it hard to decide which feature encoder would be a good replacement?\n\nGood news, there are many encoding techniques that have been developed to address different types of categorical data. This talk will provide an overview on various encoding methods available in data science, and a guidance on decision making about which one is appropriate for the data at hand.\n\nJoin this talk if you would like to hear about the importance of feature encoding and why it is important to not default to One Hot Encoding in every scenario. 
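The data-contracts abstract above stays library-agnostic, so as a rough illustration of the idea only: plain pandas stands in for tools like Great Expectations, and the column names and checks are invented.

~~~python
import pandas as pd

# A data contract reduced to its essence: an explicit set of checks the
# producer promises to satisfy and the consumer can verify before use.
CONTRACT = {
    "required_columns": {"talk_id", "recorded", "duration"},
    "checks": [
        ("duration is positive", lambda df: bool((df["duration"] > 0).all())),
        ("talk_id is unique", lambda df: df["talk_id"].is_unique),
    ],
}


def validate(df: pd.DataFrame) -> None:
    missing = CONTRACT["required_columns"] - set(df.columns)
    if missing:
        raise ValueError(f"contract violation: missing columns {missing}")
    for name, check in CONTRACT["checks"]:
        if not check(df):
            raise ValueError(f"contract violation: {name}")


validate(pd.DataFrame({
    "talk_id": ["a1", "b2"],
    "recorded": ["2023-09-14", "2023-09-15"],
    "duration": [1339, 1504],
}))  # passes silently; a violating frame would raise
~~~

Keeping the contract as data (rather than ad-hoc asserts) is what lets producer and consumer version and review it together.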
It will start with commonly used approaches and will progress into more advanced and powerful techniques which can help extract meaningful information from the data.\n\nFor each presented encoder, after this talk you will know:\n- When to use it\n- When NOT to use it\n- Important considerations specific to the encoder\n- Python library that offers a built-in method with the encoder, facilitating easy integration into feature engineering pipelines.\n\nI will explore different feature encoding approaches and provide guidance for decision-making. I will cover simpler methods like Label, One Hot, and Frequency encoding, progressing to powerful techniques like Target and Rare Label encoding. Finally, I will explain more complex approaches like Weight of Evidence, Hash and Catboost encoding. I will close the talk with summarizing the key takeaways.\n\nTarget Audience:\nData scientists and anyone interested in feature encoding\n\nPrevious experience with feature encoders can be useful but is not mandatory to follow the talk.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", +    "description": "Have you ever struggled with a multitude of columns created by One Hot Encoder? Or decided to look beyond it, but found it hard to decide which feature encoder would be a good replacement?\n\nGood news, there are many encoding techniques that have been developed to address different types of categorical data. This talk will provide an overview of various encoding methods available in data science, and guidance on deciding which one is appropriate for the data at hand.\n\nJoin this talk if you would like to hear about the importance of feature encoding and why it is important to not default to One Hot Encoding in every scenario. It will start with commonly used approaches and will progress into more advanced and powerful techniques which can help extract meaningful information from the data.\n\nFor each presented encoder, after this talk you will know:\n- When to use it\n- When NOT to use it\n- Important considerations specific to the encoder\n- Python library that offers a built-in method with the encoder, facilitating easy integration into feature engineering pipelines.\n\nI will explore different feature encoding approaches and provide guidance for decision-making. I will cover simpler methods like Label, One Hot, and Frequency encoding, progressing to powerful techniques like Target and Rare Label encoding. Finally, I will explain more complex approaches like Weight of Evidence, Hash, and CatBoost encoding. 
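(As a concrete companion to the encoders listed above: most of them ship in the category_encoders package. A brief sketch with toy data follows; the frame and column names are invented.)

~~~python
# A minimal sketch of a few encoders named above, using the
# category_encoders package (the data and column names are made up).
import pandas as pd
import category_encoders as ce

X = pd.DataFrame({"city": ["Ams", "Ams", "Rot", "Utr"]})
y = pd.Series([1, 0, 1, 0])  # binary target

# Target encoding: replace each category with a smoothed mean of y.
X_target = ce.TargetEncoder(cols=["city"]).fit_transform(X, y)

# Hashing encoding: fixed-width output, no category dictionary to store.
X_hash = ce.HashingEncoder(cols=["city"], n_components=8).fit_transform(X)

# CatBoost encoding: an ordered variant of target encoding that
# reduces target leakage.
X_cb = ce.CatBoostEncoder(cols=["city"]).fit_transform(X, y)
print(X_target.head())
~~~

Each encoder exposes the same fit/transform contract, which is what makes it a drop-in step in a feature engineering pipeline.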
I will close the talk by summarizing the key takeaways.\n\nTarget Audience:\nData scientists and anyone interested in feature encoding\n\nPrevious experience with feature encoders can be useful but is not mandatory to follow the talk.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1628, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json index 393511f04..041a538a0 100644 --- a/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/andy-kitchen-promptly-evaluating-prompts-with-bayesian-tournaments-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { -    "description": "Pick your next hot LLM prompt using a Bayesian tournament! Get a quick LLM dopamine hit with a side of decision theory vegetables. It's Bayesian Thunderdome: many prompts enter, one prompt leaves.\n\nHow do you chose the best LLM prompt systematically beyond guessing and vibes? Use the winner of a Bayesian tournament! Get a quick dopamine hit from fun LLM prompt magic with a side of Bayesian decision theory vegetables. If you are doing stuff with LLMs \u2014 you'll get a serious tool to improve your prompting game. If you're not using LLMs \u2014 you'll learn about Bayesian tournaments. They are not well known but have wide applicability: they help you optimally choose a winner using a minimal number of matches.\n\nBio:\nAndy Kitchen\nI've helped found multiple start-ups, including CorticalLabs an AI+Biotech company working on \"Synthetic Biological Intelligence\". I've co-authored several papers and patents in deep learning and neuroscience. I've made a mess in more than a dozen programming languages over my career. My stack is full. I've worked on custom neural interface hardware to web apps and everything in between. I've won a few hack-a-thons. I started the Machine Learning and AI meetup in Melbourne Australia, helped found & organize the Compose :: Melbourne conference. I have two cats, I scoop their poop most days.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. 
The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", +    "description": "Pick your next hot LLM prompt using a Bayesian tournament! Get a quick LLM dopamine hit with a side of decision theory vegetables. It's Bayesian Thunderdome: many prompts enter, one prompt leaves.\n\nHow do you choose the best LLM prompt systematically beyond guessing and vibes? Use the winner of a Bayesian tournament! Get a quick dopamine hit from fun LLM prompt magic with a side of Bayesian decision theory vegetables. If you are doing stuff with LLMs \u2014 you'll get a serious tool to improve your prompting game. If you're not using LLMs \u2014 you'll learn about Bayesian tournaments. They are not well known but have wide applicability: they help you optimally choose a winner using a minimal number of matches.\n\nBio:\nAndy Kitchen\nI've helped found multiple start-ups, including CorticalLabs an AI+Biotech company working on \"Synthetic Biological Intelligence\". I've co-authored several papers and patents in deep learning and neuroscience. I've made a mess in more than a dozen programming languages over my career. My stack is full. I've worked on custom neural interface hardware to web apps and everything in between. I've won a few hack-a-thons. I started the Machine Learning and AI meetup in Melbourne Australia, helped found & organize the Compose :: Melbourne conference. I have two cats, I scoop their poop most days.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
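(The description above does not spell out the tournament mechanics, so the following is only one plausible reading: Thompson sampling over pairwise prompt duels, with a Beta posterior per prompt. The judge function is a stand-in for whatever decides a duel, such as a human rater or an LLM judge.)

~~~python
# One plausible sketch of a "Bayesian tournament": Thompson sampling
# over pairwise prompt duels. `judge` is a placeholder, not a real API.
import random

prompts = ["prompt A", "prompt B", "prompt C"]
wins = {p: 1 for p in prompts}    # Beta(1, 1) priors
losses = {p: 1 for p in prompts}

def judge(a: str, b: str) -> str:
    """Placeholder duel: replace with a real pairwise evaluation."""
    return random.choice([a, b])

for _ in range(100):
    # Sample a plausible win rate per prompt, then duel the top two.
    sampled = {p: random.betavariate(wins[p], losses[p]) for p in prompts}
    a, b = sorted(prompts, key=sampled.get, reverse=True)[:2]
    winner = judge(a, b)
    loser = b if winner == a else a
    wins[winner] += 1
    losses[loser] += 1

best = max(prompts, key=lambda p: wins[p] / (wins[p] + losses[p]))
print("tournament winner:", best)
~~~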
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1746, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json index 2a4e774d7..109c797c4 100644 --- a/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/bossan-extend-your-scikit-learn-workflow-with-hugging-face-and-skorch-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Discover how to bridge the gap between traditional machine learning and the rapidly evolving world of AI with skorch. This package integrates the Hugging Face ecosystem while adhering to the familiar scikit-learn API. We will explore fine-turing of pre-trained models, creating our own tokenizers, accelerating model training, and leveraging Large Language Models.\n\nThe machine learning world is evolving quickly, AI is talked about everywhere, with the Hugging Face ecosystem being in the midst of it. For traditional machine learning users, especially coming from scikit-learn, keeping up can be quite overwhelming. With the help of the skorch package, it is possible to marry the best of both worlds. It allows you to integrate with many of the Hugging Face features while conforming to the sklearn API.\n\nIn this talk, I'll give a brief introduction to skorch. Then we will learn how to use it to tap into the Hugging Face ecosystem, benefiting from: using pre-trained models and fine-tuning them, working with tokenizers as if they were sklearn transformers, accelerating model training, and even using Large Language Models as zero-shot classifiers. I'll discuss some benefits and drawbacks of this approach.\n\nThis talk should be of interest to you if you're coming from the scikit-learn world and are interested in the latest deep learning developments. Familiarity with scikit-learn and a little bit of PyTorch knowledge is recommended.\n\nBio:\nBenjamin Bossan\nI worked as a Data Scientist and Head of Data Science for a couple of ears, now I'm Machine Learning Engineer at Hugging Face. I'm also a maintainer of the skorch package (https://github.com/skorch-dev/skorch).\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", +    "description": "Discover how to bridge the gap between traditional machine learning and the rapidly evolving world of AI with skorch. This package integrates the Hugging Face ecosystem while adhering to the familiar scikit-learn API. We will explore fine-tuning of pre-trained models, creating our own tokenizers, accelerating model training, and leveraging Large Language Models.\n\nThe machine learning world is evolving quickly, AI is talked about everywhere, with the Hugging Face ecosystem being in the midst of it. For traditional machine learning users, especially coming from scikit-learn, keeping up can be quite overwhelming. With the help of the skorch package, it is possible to marry the best of both worlds. It allows you to integrate with many of the Hugging Face features while conforming to the sklearn API.\n\nIn this talk, I'll give a brief introduction to skorch. Then we will learn how to use it to tap into the Hugging Face ecosystem, benefiting from: using pre-trained models and fine-tuning them, working with tokenizers as if they were sklearn transformers, accelerating model training, and even using Large Language Models as zero-shot classifiers. I'll discuss some benefits and drawbacks of this approach.\n\nThis talk should be of interest to you if you're coming from the scikit-learn world and are interested in the latest deep learning developments. Familiarity with scikit-learn and a little bit of PyTorch knowledge is recommended.\n\nBio:\nBenjamin Bossan\nI worked as a Data Scientist and Head of Data Science for a couple of years, now I'm a Machine Learning Engineer at Hugging Face. I'm also a maintainer of the skorch package (https://github.com/skorch-dev/skorch).\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
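(A minimal sketch of the skorch pattern the abstract describes: a PyTorch module wrapped so it behaves like a scikit-learn estimator. The module and data here are toy stand-ins, not the talk's examples.)

~~~python
# Sketch of wrapping a PyTorch module as a scikit-learn estimator
# with skorch; the MLP and the random data are toy stand-ins.
import numpy as np
import torch.nn as nn
from skorch import NeuralNetClassifier

class MLP(nn.Module):
    def __init__(self, num_features=20, num_classes=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_features, 32), nn.ReLU(), nn.Linear(32, num_classes),
        )

    def forward(self, X):
        return self.net(X)  # raw logits, paired with CrossEntropyLoss below

net = NeuralNetClassifier(MLP, criterion=nn.CrossEntropyLoss, max_epochs=5, lr=0.1)

X = np.random.randn(100, 20).astype(np.float32)
y = np.random.randint(0, 2, size=100).astype(np.int64)
net.fit(X, y)          # plain sklearn-style API
pred = net.predict(X)  # numpy array of class labels
~~~

Because the wrapper satisfies the sklearn estimator contract, it can sit inside Pipeline, GridSearchCV, and the rest of the scikit-learn tooling unchanged.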
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1422, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json b/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json index e883f98c8..09ed588ee 100644 --- a/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json +++ b/pydata-amsterdam-2023/videos/building-true-machine-learning-mvps-validating-the-value-chain-as-a-product-data-scientist.json @@ -1,5 +1,5 @@ { -    "description": "Some say machine learning projects fail because they live in notebooks.\n\nBut I would bet that even more of them fail because their projects solve a problem that doesn\u2019t exist. Or uses an interface that\u2019s not feasible. In other words, they fail because they don\u2019t validate their underlying assumptions.\n\nProduct analytics helps build models that solve real problems. In my time at ING, I\u2019ve been dealing with a lot of the latter, and I\u2019ll be sharing my thoughts on how to find problems worth solving with data science.\n\nBio:\nAzamat Omuraliev\nAzamat Omuraliev is a Senior Data Scientist at ING. Cracking the problem of personalization since joining ING in 2020! Decided to stay on this topic because it\u2019s a challenge that requires getting many things right: constructing the right kind of machine learning model, staying in touch with customers and handling millions of interactions daily. Thanks to that, still learning something new on the job every single day.\n\nOriginally from Kyrgyzstan, moved to the Netherlands for studies but stayed for friends and for Amsterdam \u2764\ufe0f\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", +    "description": "Some say machine learning projects fail because they live in notebooks.\n\nBut I would bet that even more of them fail because their projects solve a problem that doesn\u2019t exist. Or use an interface that\u2019s not feasible. In other words, they fail because they don\u2019t validate their underlying assumptions.\n\nProduct analytics helps build models that solve real problems. In my time at ING, I\u2019ve been dealing with a lot of the latter, and I\u2019ll be sharing my thoughts on how to find problems worth solving with data science.\n\nBio:\nAzamat Omuraliev\nAzamat Omuraliev is a Senior Data Scientist at ING. 
Cracking the problem of personalization since joining ING in 2020! Decided to stay on this topic because it\u2019s a challenge that requires getting many things right: constructing the right kind of machine learning model, staying in touch with customers and handling millions of interactions daily. Thanks to that, still learning something new on the job every single day.\n\nOriginally from Kyrgyzstan, moved to the Netherlands for studies but stayed for friends and for Amsterdam \u2764\ufe0f\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1529, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json b/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json index 6df38c28c..26188d87d 100644 --- a/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json +++ b/pydata-amsterdam-2023/videos/buso-dohmen-mlops-on-the-fly-optimizing-a-feature-store-with-duckdb-and-arrowflight-pdams-23.json @@ -1,5 +1,5 @@ { - "description": "Feature Stores are a vital part of the MLOps stack for managing machine learning features and ensuring data consistency. This talk introduces Feature Stores and the underlying data management architecture. We\u2019ll then discuss the challenges and learnings of integrating DuckDB and Arrow Flight into the our Feature Store platform, and share benchmarks showing up to 30x speedups compared to Spark/Hive. 
Discover how DuckDB and ArrowFlight can also speedup your data management and machine learning pipelines.\n\nIn this talk, we will cover the following topics:\n\n\u2022 Introduction to Machine Learning Feature Stores (5 min): Understanding the role of feature stores in the MLOps stack and their significance in managing machine learning features within organizations.\n\u2022 Data management architecture behind Feature Stores (2-3 min): Exploring the underlying mechanisms and data management components employed in feature stores.\n\u2022 Introduction to DuckDB and Arrow Flight (5 min): Highlighting the integration of DuckDB and Arrow Flight into the PyData ecosystem, leveraging the capabilities of Arrow.\n\u2022 The journey of integrating DuckDB and Arrow Flight into our Feature Store platform (12 min): Sharing our experiences and insights on integrating DuckDB and Arrow Flight into the Hudi-based Lakehouse platform that powers our (offline) feature store, discussing challenges and successes encountered along the way.\n\u2022 Benchmarks (5 min): Presenting a benchmark comparing the performance of DuckDB/Arrow Flight vs Spark/HiveServer2, in particular for small to medium sized data.\n\nAttendees will gain a deeper understanding of feature stores, insights into the integration of DuckDB and ArrowFlight into the PyData ecosystem, and practical knowledge on enhancing the performance of machine learning pipelines.\n\nBio:\nFabio Buso\nFabio Buso is VP of Engineering at Hopsworks, leading the Feature Store development team. Fabio holds a master\u2019s degree in Cloud Computing and Services with a focus on data intensive applications.\n\nTill D\u00f6hmen\nTill D\u00f6hmen is a Research Engineer at Hopsworks, where he is contribibuting to the development of Hopswork's Python-centric Feature Store platform. In addition to his work at Hopsworks, he is a guest researcher at the Intelligent Data Engineering Lab of the University of Amsterdam and engages in research at the intersection of data management and machine learning.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", +    "description": "Feature Stores are a vital part of the MLOps stack for managing machine learning features and ensuring data consistency. This talk introduces Feature Stores and the underlying data management architecture. We\u2019ll then discuss the challenges and learnings of integrating DuckDB and Arrow Flight into our Feature Store platform, and share benchmarks showing up to 30x speedups compared to Spark/Hive. 
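(The Hopsworks integration itself is more involved, but the DuckDB-to-Arrow hand-off it builds on can be sketched in a few lines; 'features.parquet' and the query are made up, and pyarrow must be installed for the Arrow result.)

~~~python
# Sketch of the DuckDB-to-Arrow hand-off underlying this kind of
# setup; 'features.parquet' is a made-up file for illustration.
import duckdb

con = duckdb.connect()
query = """
    SELECT customer_id, avg(amount) AS avg_amount
    FROM 'features.parquet'
    GROUP BY customer_id
"""
# DuckDB can return results directly as an Arrow table, which is the
# same columnar format Arrow Flight ships over the network.
arrow_table = con.execute(query).fetch_arrow_table()
df = arrow_table.to_pandas()
~~~

Arrow Flight then moves that same Arrow data over the network without an extra serialization step, which is where much of the speedup over HiveServer2-style transports comes from.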
Discover how DuckDB and ArrowFlight can also speed up your data management and machine learning pipelines.\n\nIn this talk, we will cover the following topics:\n\n\u2022 Introduction to Machine Learning Feature Stores (5 min): Understanding the role of feature stores in the MLOps stack and their significance in managing machine learning features within organizations.\n\u2022 Data management architecture behind Feature Stores (2-3 min): Exploring the underlying mechanisms and data management components employed in feature stores.\n\u2022 Introduction to DuckDB and Arrow Flight (5 min): Highlighting the integration of DuckDB and Arrow Flight into the PyData ecosystem, leveraging the capabilities of Arrow.\n\u2022 The journey of integrating DuckDB and Arrow Flight into our Feature Store platform (12 min): Sharing our experiences and insights on integrating DuckDB and Arrow Flight into the Hudi-based Lakehouse platform that powers our (offline) feature store, discussing challenges and successes encountered along the way.\n\u2022 Benchmarks (5 min): Presenting a benchmark comparing the performance of DuckDB/Arrow Flight vs Spark/HiveServer2, in particular for small to medium sized data.\n\nAttendees will gain a deeper understanding of feature stores, insights into the integration of DuckDB and ArrowFlight into the PyData ecosystem, and practical knowledge on enhancing the performance of machine learning pipelines.\n\nBio:\nFabio Buso\nFabio Buso is VP of Engineering at Hopsworks, leading the Feature Store development team. Fabio holds a master\u2019s degree in Cloud Computing and Services with a focus on data intensive applications.\n\nTill D\u00f6hmen\nTill D\u00f6hmen is a Research Engineer at Hopsworks, where he is contributing to the development of Hopsworks' Python-centric Feature Store platform. In addition to his work at Hopsworks, he is a guest researcher at the Intelligent Data Engineering Lab of the University of Amsterdam and engages in research at the intersection of data management and machine learning.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1254, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json index dc973843f..4735a87d5 100644 --- a/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/cikla-zhutovsky-transfer-learning-in-boosting-models-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Did you know that you could do transfer learning on boosted forests too? Even in current days, we face business cases where the modelling sample is very low. This brings an uncertainty to the modelling results and in some cases no ability to model at all. To counter it, we investigated the ability to use transfer learning approaches on boosting models. In this talk, we would like to show the methods used and results from a real case example applied to the credit risk domain.\n\nTransfer learning (TL), a form of machine learning, involves leveraging knowledge acquired while addressing one task and applying it to a related task. While TL is mainly associated with deep learning tasks, it is also applicable to boosting algorithms which are commonly used in advanced credit risk modelling.\n\nDuring the talk, we present a real use-case involving building a probability of default (PD) model for a customer segment with small data history within the bank. There can be several ways to benefit from data coming from other customer segments with already rich data available within the bank.\n\nSimple approaches would be:\n- Fit a model on only rich data & just apply to the limited data\n- Fit a model on both data sets, but tune it on the limited data\n\nMore complex (TL) approaches:\n- Fit a model on rich data with sample weights come from resemblance analysis to calculate similarity between these two data sources.\n- Use refitting with the limited data on the model trained on rich data\n- Start with an initial pre-trained model while modelling on the limited data\n\nJoin us for an engaging session where we will share the outcomes of our experiments and lessons learned, as we address these approaches that hold relevance beyond the presented use-case, offering practical applicability for similar scenarios in your own domain.\n\nBios:\nBusra Cikla\nBusra is an experienced data scientist with passion for analytics at ING\u2019s Risk & Pricing Advanced Analytics Team in Amsterdam. She has designed and developed end-to-end advanced analytics solutions to a business problem in different domains during the last 5 years at ING. Currently, she is working on real-time credit risk models by using ML. Busra has a background on optimisation and operational research from her B.Sc. study and she has M.Sc. degree on Data Science.\n\nPaul Zhutovsky\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. 
PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", +    "description": "Did you know that you could do transfer learning on boosted forests too? Even today, we face business cases where the modelling sample is very small. This brings uncertainty to the modelling results and, in some cases, no ability to model at all. To counter it, we investigated the ability to use transfer learning approaches on boosting models. In this talk, we would like to show the methods used and results from a real case example applied to the credit risk domain.\n\nTransfer learning (TL), a form of machine learning, involves leveraging knowledge acquired while addressing one task and applying it to a related task. While TL is mainly associated with deep learning tasks, it is also applicable to boosting algorithms which are commonly used in advanced credit risk modelling.\n\nDuring the talk, we present a real use-case involving building a probability of default (PD) model for a customer segment with small data history within the bank. There can be several ways to benefit from data coming from other customer segments with already rich data available within the bank.\n\nSimple approaches would be:\n- Fit a model on only rich data & just apply to the limited data\n- Fit a model on both data sets, but tune it on the limited data\n\nMore complex (TL) approaches:\n- Fit a model on rich data with sample weights derived from a resemblance analysis that calculates the similarity between the two data sources.\n- Use refitting with the limited data on the model trained on rich data\n- Start with an initial pre-trained model while modelling on the limited data\n\nJoin us for an engaging session where we will share the outcomes of our experiments and lessons learned, as we address these approaches that hold relevance beyond the presented use-case, offering practical applicability for similar scenarios in your own domain.\n\nBios:\nBusra Cikla\nBusra is an experienced data scientist with a passion for analytics at ING\u2019s Risk & Pricing Advanced Analytics Team in Amsterdam. She has designed and developed end-to-end advanced analytics solutions to business problems in different domains during the last 5 years at ING. Currently, she is working on real-time credit risk models by using ML. Busra has a background in optimisation and operational research from her B.Sc. study and an M.Sc. degree in Data Science.\n\nPaul Zhutovsky\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. 
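(A sketch of the last approach listed above, warm-starting on the limited data from a model pre-trained on the rich data, using xgboost's xgb_model continuation hook; the data is synthetic and the hyperparameters arbitrary.)

~~~python
# Warm-starting a booster on a small segment from a model pre-trained
# on a rich segment; synthetic data, arbitrary hyperparameters.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X_rich, y_rich = rng.normal(size=(5000, 10)), rng.integers(0, 2, 5000)
X_small, y_small = rng.normal(size=(200, 10)), rng.integers(0, 2, 200)

params = {"objective": "binary:logistic", "max_depth": 3, "eta": 0.1}

# 1) Pre-train on the rich segment.
pretrained = xgb.train(params, xgb.DMatrix(X_rich, label=y_rich), num_boost_round=100)

# 2) Continue boosting on the small segment, starting from the
#    pre-trained booster instead of from scratch.
transferred = xgb.train(
    params,
    xgb.DMatrix(X_small, label=y_small),
    num_boost_round=30,
    xgb_model=pretrained,
)
~~~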
\n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1729, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json index c5d250a0a..fd5d6819a 100644 --- a/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/danial-senejohnny-survival-analysis-a-deep-dive-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Survival analysis was initially introduced to handle the data analysis required in use cases revolving death and treatment in health care. Due to its merit, this method has spread to many other domains for analyzing and modeling the data where the outcome is the time until an event of interest occurs. Domains such as finance, economy, sociology and engineering.\n\nThis talk aims at unraveling the potential of survival analysis with examples from different domains. A taxonomy of the existing descriptive and predictive analytics algorithms in survival analysis are demonstrated. The concept of some candidate algorithms from each group are explained in detail, along with an example and implementation guideline using the right open source framework.\n\nThis talk aims at introducing the tools and techniques within the survival analysis domain for analyzing the time until an event of interest occurs. Examples of such event are rehospitalization after being discharged from hospital (healthcare), device needing maintenance after (re)commissioning (manufacturing), finding a job after unemployment (economy), an asset being sold after listing for sale (real-estate/finance), getting rearrested after being released from prison (criminology/sociology), and many other examples.\n\nThe potential of survival analysis tools, in both descriptive and predictive analytics, are hidden to the data science community. As a result of this, such problems are often formulated as classification or regression, where this also comes with its own caveats and pitfalls.\n\nThe aim of the talk is to simplify methods and algorithms in survival analysis with some shallow mathematical focus and starts by raising awareness about survival analysis and its potential and applications for the general audience. 
The descriptive and predictive algorithms within survival analysis address the data scientists with basic statistics and machine learning background, as the main audience of the talk.\n\nIntroduction to Survival Analysis\nApplications in different domains\nFormulating Survival Analysis Problem\nTaxonomy of Descriptive & Predictive Methods with python packages\nOverview of Descriptive Methods\n- Kaplan-Meier [3 slide]\n- Nelson-Aalen & Weibull [half slide]\nOverview of Predictive Methods\n- Cox Proportional Hazard [3 slide]\n- Survival Tree & Forrest [1 slide]\n- Deep Survival Analysis [1 slide]\nConclusion\nAt the end of the talk, the audience becomes aware of what survival analysis can do and which algorithms, with their corresponding python package, are the low hanging fruit in a data scientist toolbox. In addition, the audience will gain a structured overview on the topic so that any need for further knowledge acquisition could be independently followed in the future.\n\nBio:\nDanial Senejohnny\nI am a data scientist with a background in applied mathematics (systems & control). In my career as data scientist, I have experienced different sectors, i.e. manufacturing, cybersecurity, healthcare, and finance. Currently, I am contributing to data-driven solutions that improve our clients\u2019 experience and satisfaction within ABN AMRO.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", +    "description": "Survival analysis was initially introduced to handle the data analysis required in use cases revolving around death and treatment in health care. Due to its merit, this method has spread to many other domains for analyzing and modeling the data where the outcome is the time until an event of interest occurs. Domains such as finance, economy, sociology and engineering.\n\nThis talk aims at unraveling the potential of survival analysis with examples from different domains. A taxonomy of the existing descriptive and predictive analytics algorithms in survival analysis is demonstrated. The concepts of some candidate algorithms from each group are explained in detail, along with an example and implementation guideline using the right open source framework.\n\nThis talk aims at introducing the tools and techniques within the survival analysis domain for analyzing the time until an event of interest occurs. 
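(Two of the estimators named later in this abstract, Kaplan-Meier and Cox proportional hazards, are available in the lifelines package; a minimal sketch on synthetic data follows.)

~~~python
# Kaplan-Meier and Cox proportional hazards via lifelines, on a tiny
# synthetic dataset (durations, censoring flags, one covariate).
import pandas as pd
from lifelines import KaplanMeierFitter, CoxPHFitter

df = pd.DataFrame({
    "duration": [5, 6, 6, 2, 4, 9, 8, 3],   # time until event or censoring
    "event": [1, 0, 1, 1, 0, 1, 1, 0],      # 1 = event observed, 0 = censored
    "age": [61, 45, 50, 39, 47, 66, 58, 41],
})

kmf = KaplanMeierFitter()
kmf.fit(df["duration"], event_observed=df["event"])
print(kmf.survival_function_)     # descriptive: estimated S(t)

cph = CoxPHFitter()
cph.fit(df, duration_col="duration", event_col="event")
cph.print_summary()               # predictive: hazard ratio for age
~~~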
Examples of such events are rehospitalization after being discharged from hospital (healthcare), device needing maintenance after (re)commissioning (manufacturing), finding a job after unemployment (economy), an asset being sold after listing for sale (real-estate/finance), getting rearrested after being released from prison (criminology/sociology), and many other examples.\n\nThe potential of survival analysis tools, in both descriptive and predictive analytics, is hidden from the data science community. As a result of this, such problems are often formulated as classification or regression, which also comes with its own caveats and pitfalls.\n\nThe aim of the talk is to simplify methods and algorithms in survival analysis with a light mathematical focus, and it starts by raising awareness about survival analysis and its potential and applications for the general audience. The descriptive and predictive algorithms within survival analysis are addressed to data scientists with a basic statistics and machine learning background, the main audience of the talk.\n\nIntroduction to Survival Analysis\nApplications in different domains\nFormulating Survival Analysis Problem\nTaxonomy of Descriptive & Predictive Methods with python packages\nOverview of Descriptive Methods\n- Kaplan-Meier [3 slides]\n- Nelson-Aalen & Weibull [half slide]\nOverview of Predictive Methods\n- Cox Proportional Hazard [3 slides]\n- Survival Tree & Forest [1 slide]\n- Deep Survival Analysis [1 slide]\nConclusion\nAt the end of the talk, the audience becomes aware of what survival analysis can do and which algorithms, with their corresponding python package, are the low-hanging fruit in a data scientist's toolbox. In addition, the audience will gain a structured overview of the topic so that any need for further knowledge acquisition could be independently followed in the future.\n\nBio:\nDanial Senejohnny\nI am a data scientist with a background in applied mathematics (systems & control). In my career as a data scientist, I have worked in different sectors, i.e. manufacturing, cybersecurity, healthcare, and finance. Currently, I am contributing to data-driven solutions that improve our clients\u2019 experience and satisfaction within ABN AMRO.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1386, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json b/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json index 0c7dda74f..8de0cc830 100644 --- a/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json +++ b/pydata-amsterdam-2023/videos/daniel-van-der-ende-return-to-data-s-inferno-are-the-7-layers-of-data-testing-hell-still-relevant.json @@ -1,5 +1,5 @@ { - "description": "Back in 2018, a blogpost titled \"Data's Inferno: 7 circles of data testing hell with Airflow\" presented a layered approach to data quality checks in data applications and pipelines. Now, 5 years later, this talk looks back at Data's Inferno and surveys what has changed but also what hasn't in the space of ensuring high data quality.\n\n5 years ago a blog post called \"Data's Inferno\" (https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8) was written about how to ensure high data quality with Apache Airflow. It suggested using different types of tests as layers to catch issues lurking within the data. These layers included tests for Airflow DAG integrity, mock data pipelines, production data tests, and more. Combining these layers made for a reliable way to filter out incorrect data. Despite the blogpost's age, the ideas are still relevant today. New tools and applications have been developed to help improve data quality as well as new best practices. In this talk, we'll review the layers of Data's Inferno and how they contributed to improving data quality. We'll also look at how new tools address the same concerns. Finally, we'll discuss how we expect and hope the data quality landscape to evolve in the future.\n\nBio:\nDaniel van der Ende\nDaniel van der Ende is a Data Engineer at Xebia Data. He enjoys working on high performance distributed computation with Spark, empowering data scientists by helping them to run their models on very large datasets with high performance. He is an Apache Spark and Apache Airflow contributor and speaker at conferences and meetups.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", +    "description": "Back in 2018, a blogpost titled \"Data's Inferno: 7 circles of data testing hell with Airflow\" presented a layered approach to data quality checks in data applications and pipelines. Now, 5 years later, this talk looks back at Data's Inferno and surveys what has changed but also what hasn't in the space of ensuring high data quality.\n\n5 years ago a blog post called \"Data's Inferno\" (https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8) was written about how to ensure high data quality with Apache Airflow. It suggested using different types of tests as layers to catch issues lurking within the data. These layers included tests for Airflow DAG integrity, mock data pipelines, production data tests, and more. Combining these layers made for a reliable way to filter out incorrect data. Despite the blogpost's age, the ideas are still relevant today. New tools and applications have been developed to help improve data quality, along with new best practices. In this talk, we'll review the layers of Data's Inferno and how they contributed to improving data quality. We'll also look at how new tools address the same concerns. Finally, we'll discuss how we expect and hope the data quality landscape to evolve in the future.\n\nBio:\nDaniel van der Ende\nDaniel van der Ende is a Data Engineer at Xebia Data. He enjoys working on high performance distributed computation with Spark, empowering data scientists by helping them to run their models on very large datasets with high performance. He is an Apache Spark and Apache Airflow contributor and speaker at conferences and meetups.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1465, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json index e02194306..a0a9d82fb 100644 --- a/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/emeli-dral-mind-the-language-how-to-monitor-nlp-and-llm-in-production-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { -    "description": "How can you evaluate your production models when the data is not structured and you have no labels? 
To start, by tracking patterns and changes in the input data and model outputs. In this talk, I will give an overview of the possible approaches to monitor NLP and LLM models: from embedding drift detection to using regular expressions.\n\nOnce LLMs or NLP models are in production, you want to ensure they work as intended. But how can you observe their behavior in the wild and detect when something goes wrong?\n\nFirst, you often lack true labels. To add to this, the data is unstructured - how exactly can you track a pile of texts?\n\nMonitoring the patterns in the input data and model outputs is often the first line of defense. In the talk, I will review possible approaches to monitoring drift and data quality issues in text data and explain their pros and cons.\n\nI will cover:\n- Statistical embedding drift detection\n- Tracking interpretable text descriptors like text length and sentiment\n- Using regular expressions to validate outputs\n- Explaining drift through model-based drift detection\n- Detecting changes in multi-modal data\n\nI will also introduce open-source tools, models, and visualization techniques one can use to monitor LLM and NLP models.\n\nThis talk will benefit data scientists and machine learning engineers who work with NLP and LLM in production.\n\nBio:\nEmeli Dral\nEmeli Dral is a Co-founder and CTO at Evidently AI, a startup developing open-source tools to evaluate, test, and monitor the performance of machine learning models.\n\nEarlier, she co-founded an industrial AI startup and served as the Chief Data Scientist at Yandex Data Factory. She led over 50 applied ML projects for various industries - from banking to manufacturing. Emeli is a data science lecturer at GSOM SpBU and Harbour.Space University. She is a co-author of the Machine Learning and Data Analysis curriculum at Coursera with over 100,000 students.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "How can you evaluate your production models when the data is not structured and you have no labels? To start, by tracking patterns and changes in the input data and model outputs. In this talk, I will give an overview of the possible approaches to monitor NLP and LLM models: from embedding drift detection to using regular expressions.\n\nOnce LLMs or NLP models are in production, you want to ensure they work as intended. But how can you observe their behavior in the wild and detect when something goes wrong?\n\nFirst, you often lack true labels. 
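(One of the approaches reviewed below, statistical embedding drift detection, can be as simple as a two-sample test per embedding dimension. The sketch below uses random stand-in embeddings and an arbitrary 1% significance level.)

~~~python
# Sketch of statistical embedding drift detection: a two-sample
# Kolmogorov-Smirnov test per embedding dimension. The embeddings
# are random stand-ins; the 1% threshold is an arbitrary choice.
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
reference = rng.normal(0.0, 1.0, size=(1000, 384))   # embeddings at training time
current = rng.normal(0.2, 1.0, size=(1000, 384))     # embeddings in production

# Test dimension by dimension; flag drift if many dimensions reject
# (a crude but serviceable first line of defense).
p_values = np.array([ks_2samp(reference[:, i], current[:, i]).pvalue
                     for i in range(reference.shape[1])])
share_drifted = (p_values < 0.01).mean()
print(f"{share_drifted:.0%} of embedding dimensions drifted")
~~~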
To add to this, the data is unstructured - how exactly can you track a pile of texts?\n\nMonitoring the patterns in the input data and model outputs is often the first line of defense. In the talk, I will review possible approaches to monitoring drift and data quality issues in text data and explain their pros and cons.\n\nI will cover:\n- Statistical embedding drift detection\n- Tracking interpretable text descriptors like text length and sentiment\n- Using regular expressions to validate outputs\n- Explaining drift through model-based drift detection\n- Detecting changes in multi-modal data\n\nI will also introduce open-source tools, models, and visualization techniques one can use to monitor LLM and NLP models.\n\nThis talk will benefit data scientists and machine learning engineers who work with NLP and LLM in production.\n\nBio:\nEmeli Dral\nEmeli Dral is a Co-founder and CTO at Evidently AI, a startup developing open-source tools to evaluate, test, and monitor the performance of machine learning models.\n\nEarlier, she co-founded an industrial AI startup and served as the Chief Data Scientist at Yandex Data Factory. She led over 50 applied ML projects for various industries - from banking to manufacturing. Emeli is a data science lecturer at GSOM SpBU and Harbour.Space University. She is a co-author of the Machine Learning and Data Analysis curriculum at Coursera with over 100,000 students.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1519, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json b/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json index 9bd2401bd..00b978ff4 100644 --- a/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json +++ b/pydata-amsterdam-2023/videos/enhancing-economic-outcomes-leveraging-business-metrics-for-machine-learning-model-optimization.json @@ -1,5 +1,5 @@ { - "description": "Optimizing machine learning models using regular metrics is a common practice in the industry. However, aligning model optimization with business metrics is closely tied to the objectives of the business and is highly valued by product managers and other stakeholders. This talk delves into the process of training machine learning models based on business metrics in order to enhance economic outcomes. 
With a primary focus on data scientists and machine learning practitioners, this talk explores techniques, methodologies, and real-world applications that harness the power of business metrics to propel machine learning models and foster business success. We will present a specific case study that demonstrates how we utilized business metrics at Booking.com that brought significant impact on model performance on business outcomes. Specifically, we will discuss our approaches to leveraging business metrics for hyperparameter tuning and reducing model complexity, which instill greater confidence within our team when deploying improved models to production.\n\nDescription\nThis talk aims to equip data scientists and machine learning practitioners with the knowledge and tools to train machine learning models on business metrics effectively. We will delve into the process of hyperparameter tuning, algorithm selection, and model evaluation specifically tailored for optimizing economic outcomes. A real-world use case at Booking.com will demonstrate the transformative power of this approach.\n\nOutline\n- Introduction to training machine learning models on machine learning metrics versus business metrics\n- Overview of the significance of leveraging business metrics to improve machine learning models' performance on business metrics\n- Introduction to machine learning algorithms suitable for modeling business metrics to drive economic optimizations\n- Metrics and evaluation, and training techniques specific to assessing the business impact of machine learning models\n- Showcasing practical use case at Booking.com where training models on business metrics has led to significant improvements in economic outcomes\n\nCentral Focus\nTraining machine learning models on business metrics present a powerful methodology for optimizing economic outcomes. By incorporating relevant business data and metrics into the modeling process, data scientists and machine learning practitioners can drive substantial improvements in economic performance. This talk will provide attendees with the necessary insights and techniques to apply this approach successfully.\n\nKey Takeaways\n- Understanding the importance of training machine learning models on business metrics for economic optimizations\n- Familiarity with machine learning algorithms suitable for modeling business metrics and driving economic outcomes\n- Strategies for evaluating and quantifying the economic impact of machine learning models\nReal-world inspiration and practical insights for applying this approach to boost economic outcomes\n\nWe aim to deliver an informative and practical talk that caters to data scientists and machine learning practitioners. Attendees will gain actionable insights, methodologies, and real-world examples to effectively train machine learning models on business metrics, leading to enhanced economic outcomes.\n\nBio:\nFelipe Moraes\nI am a machine learning scientist at Booking.com working on personalized discounts under budget constraints.\nI have a PhD in Computer Science from the Delft University of Technology. During my PhD, I interned as an applied scientist at Amazon Alexa Shopping, where I worked on finding proxies for what customers find relevant when comparing products during their search shopping journey in order to empower Amazon recommendation systems. 
Before that I obtained a BSc and MSc in Computer Science from the Federal University of Minas Gerais, visited research labs at NYU and the University of Quebec, and worked as a software engineer intern in a news recommendation system start up.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Optimizing machine learning models using regular metrics is a common practice in the industry. However, aligning model optimization with business metrics is closely tied to the objectives of the business and is highly valued by product managers and other stakeholders. This talk delves into the process of training machine learning models based on business metrics in order to enhance economic outcomes. With a primary focus on data scientists and machine learning practitioners, this talk explores techniques, methodologies, and real-world applications that harness the power of business metrics to propel machine learning models and foster business success. We will present a specific case study that demonstrates how the business metrics we utilized at Booking.com brought a significant impact on model performance on business outcomes. Specifically, we will discuss our approaches to leveraging business metrics for hyperparameter tuning and reducing model complexity, which instill greater confidence within our team when deploying improved models to production.\n\nDescription\nThis talk aims to equip data scientists and machine learning practitioners with the knowledge and tools to train machine learning models on business metrics effectively. We will delve into the process of hyperparameter tuning, algorithm selection, and model evaluation specifically tailored for optimizing economic outcomes. A real-world use case at Booking.com will demonstrate the transformative power of this approach.\n\nOutline\n- Introduction to training machine learning models on machine learning metrics versus business metrics\n- Overview of the significance of leveraging business metrics to improve machine learning models' performance on business metrics\n- Introduction to machine learning algorithms suitable for modeling business metrics to drive economic optimizations\n- Metrics and evaluation, and training techniques specific to assessing the business impact of machine learning models\n- Showcasing a practical use case at Booking.com where training models on business metrics has led to significant improvements in economic outcomes\n\nCentral Focus\nTraining machine learning models on business metrics presents a powerful methodology for optimizing economic outcomes. 
By incorporating relevant business data and metrics into the modeling process, data scientists and machine learning practitioners can drive substantial improvements in economic performance. This talk will provide attendees with the necessary insights and techniques to apply this approach successfully.\n\nKey Takeaways\n- Understanding the importance of training machine learning models on business metrics for economic optimizations\n- Familiarity with machine learning algorithms suitable for modeling business metrics and driving economic outcomes\n- Strategies for evaluating and quantifying the economic impact of machine learning models\n- Real-world inspiration and practical insights for applying this approach to boost economic outcomes\n\nWe aim to deliver an informative and practical talk that caters to data scientists and machine learning practitioners. Attendees will gain actionable insights, methodologies, and real-world examples to effectively train machine learning models on business metrics, leading to enhanced economic outcomes.\n\nBio:\nFelipe Moraes\nI am a machine learning scientist at Booking.com working on personalized discounts under budget constraints.\nI have a PhD in Computer Science from the Delft University of Technology. During my PhD, I interned as an applied scientist at Amazon Alexa Shopping, where I worked on finding proxies for what customers find relevant when comparing products during their search shopping journey in order to empower Amazon recommendation systems. Before that, I obtained a BSc and MSc in Computer Science from the Federal University of Minas Gerais, visited research labs at NYU and the University of Quebec, and worked as a software engineer intern in a news recommendation system startup.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1284, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json index 0261c6d6f..cef0532ee 100644 --- a/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/fokko-driesprong-pyiceberg-tipping-your-toes-into-the-petabyte-data-lake-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "With Apache Iceberg, you store your big data in the cloud as files (e.g., Parquet), but then query it as if it\u2019s a plain SQL table. 
You enjoy the endless scalability of the cloud, without having to worry about how to store, partition, or query your data efficiently. PyIceberg is the Python implementation of Apache Iceberg that loads your Iceberg tables into PyArrow (pandas), DuckDB, or any of your preferred engines for doing data science. This means that with PyIceberg, you can tap into big data easily by only using Python. It\u2019s time to say goodbye to the ancient Hadoop-based frameworks of the past! In this talk, you'll learn why you need Iceberg, how to use it, and why it is so fast.\n\nDescription: Working with high volumes of data has always been complex and challenging. Querying data with Spark requires you to know how the data is partitioned, otherwise, your query performance suffers tremendously. The Apache Iceberg open table format fixes this by fixing the underlying storage, instead of by educating the end users. Iceberg originated at Netflix and provides a cloud-native layer on top of your data files. It solves traditional issues regarding correctness by supporting concurrent reading and writing to the table. Iceberg improves performance dramatically by collecting metrics on the data, having the ability to easily repartition your data, and being able to compact the underlying data. Finally, it supports time travel, so the model that you're training doesn't change because new data has been added. After this talk, you'll be comfortable using Apache Iceberg.\n\nMinutes 0-5: History and why we need a table format\nMinutes 5-15: Overview of Iceberg, and how it works under the hood\nMinutes 15-30: Introduction to PyIceberg with code and real examples (notebook!!)\n\nBio:\nFokko Driesprong\nOpen Source enthousiast. Committer on Avro, Parquet, Druid, Airflow and Iceberg. Apache Software Foundation members.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "With Apache Iceberg, you store your big data in the cloud as files (e.g., Parquet), but then query it as if it\u2019s a plain SQL table. You enjoy the endless scalability of the cloud, without having to worry about how to store, partition, or query your data efficiently. PyIceberg is the Python implementation of Apache Iceberg that loads your Iceberg tables into PyArrow (pandas), DuckDB, or any of your preferred engines for doing data science. This means that with PyIceberg, you can tap into big data easily by only using Python. It\u2019s time to say goodbye to the ancient Hadoop-based frameworks of the past! 
In this talk, you'll learn why you need Iceberg, how to use it, and why it is so fast.\n\nDescription: Working with high volumes of data has always been complex and challenging. Querying data with Spark requires you to know how the data is partitioned, otherwise, your query performance suffers tremendously. The Apache Iceberg open table format fixes this by fixing the underlying storage, instead of by educating the end users. Iceberg originated at Netflix and provides a cloud-native layer on top of your data files. It solves traditional issues regarding correctness by supporting concurrent reading and writing to the table. Iceberg improves performance dramatically by collecting metrics on the data, having the ability to easily repartition your data, and being able to compact the underlying data. Finally, it supports time travel, so the model that you're training doesn't change because new data has been added. After this talk, you'll be comfortable using Apache Iceberg.\n\nMinutes 0-5: History and why we need a table format\nMinutes 5-15: Overview of Iceberg, and how it works under the hood\nMinutes 15-30: Introduction to PyIceberg with code and real examples (notebook!!)\n\nBio:\nFokko Driesprong\nOpen Source enthusiast. Committer on Avro, Parquet, Druid, Airflow and Iceberg. Apache Software Foundation member.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1289, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json b/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json index 4d4ad7445..03c49a5d8 100644 --- a/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json +++ b/pydata-amsterdam-2023/videos/forecasting-customer-lifetime-value-cltv-for-marketing-campaigns-under-uncertainty-with-pystan.json @@ -1,5 +1,5 @@ { - "description": "In this talk, we discuss how we can use the python package PySTAN to estimate the Lifetime Value (LTV) of the users that can be acquired from a marketing campaign, and use this estimate to find the optimal bidding strategy when the LTV estimate itself has uncertainty. Throughout the presentation, we highlight the benefits from using Bayesian modeling to estimate LTV, and the potential pitfalls when forecasting LTV. 
By the end of the presentation, attendees will have a solid understanding of how to use PySTAN to estimate LTV, optimize their marketing campaign bidding strategies, and implement the best Bayesian modelling solution. All of the contents and numbers in this presentation can be found in the shared GIT\n\nWe describe how to use the PySTAN to forecast the LTV of the marketing campaigns. PySTAN is a Python interface to STAN, which is a package for Bayesian inference capable of high-performance statistical computation. PySTAN\u2019s computation speed is essential in a marketing context, where we need to predict the LTV of multiple marketing campaigns over a long period, while still estimating the LTV distribution. We demonstrate how to implement a PySTAN model to predict a time-series using the Lifetime Value data from Kaggle [2], which contains approximately 200 days, in less than 2 minutes.\nWe then compare how we can achieve the exact same model with PyMC, another well-known probabilistic modelling library, and in which situations and conditions PySTAN outperforms PyMC.\n\nWith the LTV accurately predicted for the Lifetime Value data, we explain the steps to optimize the bid of marketing campaigns under uncertainty about the accuracy of our predictions. We show how different levels of uncertainty of our LTV predictions can change the optimal bidding strategy and answer questions such as \u201cHow much should we underbid when we are unsure of our LTV?\u201d.\nBy the end of the presentation, attendees will be able to implement PySTAN or PyMC to estimate LTV, know which of these two libraries is most appropriate for their needs, and apply this knowledge to find the best bidding strategy for their marketing campaigns.\n\nIn this presentation, we will thus cover the following topics:\n\nIntroduction to digital advertisement\n- Modelling advertisement for digital products\n- How to find the optimal bid for your marketing campaign\n- The role that uncertainty on the estimated LTV plays in your marketing strategy\n\nForecasting LTV with PySTAN\n- What is PySTAN\n- How to use PySTAN to estimate the LTV of your marketing campaigns\n- How to achieve the same model through PyMC\n- Comparison between PySTAN and PyMC\n\nReferences\n- The Duopoly is over because everything is an ad network[ [https://mobiledevmemo.com/the-duopoly-is-over-because-everything-is-an-ad-network/]\n- Lifetime Value data from Kaggle: https://www.kaggle.com/datasets/baetulo/lifetime-value?select=train.csv\n- Why Uncertainty Matters when forecasting Lifetime Value: https://raphaeltamaki.github.io/raphaeltamaki/posts/Forecasting%20Customer%20Lifetime%20Value%20-%20Why%20Uncertainty%20Matters/\n\nBio:\nRaphael de Brito Tamaki\nData Science Lead in the Marketing Science @Meta, where I use causal inference techniques to extract insights to help advertisers increase their marketing performance. Prior to joining Meta, I worked at Wildlife Studios - a mobile game studio with over 2B total downloads - where I was the Tech Lead for the Lifetime Value (LTV) prediction team, and implemented and maintained LTV models in production for over 10 games\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. 
The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In this talk, we discuss how we can use the Python package PySTAN to estimate the Lifetime Value (LTV) of the users that can be acquired from a marketing campaign, and use this estimate to find the optimal bidding strategy when the LTV estimate itself has uncertainty. Throughout the presentation, we highlight the benefits of using Bayesian modeling to estimate LTV, and the potential pitfalls when forecasting LTV. By the end of the presentation, attendees will have a solid understanding of how to use PySTAN to estimate LTV, optimize their marketing campaign bidding strategies, and implement the best Bayesian modelling solution. All of the contents and numbers in this presentation can be found in the shared Git repository.\n\nWe describe how to use PySTAN to forecast the LTV of the marketing campaigns. PySTAN is a Python interface to STAN, which is a package for Bayesian inference capable of high-performance statistical computation. PySTAN\u2019s computation speed is essential in a marketing context, where we need to predict the LTV of multiple marketing campaigns over a long period, while still estimating the LTV distribution. We demonstrate how to implement a PySTAN model to predict a time-series using the Lifetime Value data from Kaggle [2], which contains approximately 200 days, in less than 2 minutes.\nWe then compare how we can achieve the exact same model with PyMC, another well-known probabilistic modelling library, and in which situations and conditions PySTAN outperforms PyMC.\n\nWith the LTV accurately predicted for the Lifetime Value data, we explain the steps to optimize the bid of marketing campaigns under uncertainty about the accuracy of our predictions. 
We show how different levels of uncertainty of our LTV predictions can change the optimal bidding strategy and answer questions such as \u201cHow much should we underbid when we are unsure of our LTV?\u201d.\nBy the end of the presentation, attendees will be able to implement PySTAN or PyMC to estimate LTV, know which of these two libraries is most appropriate for their needs, and apply this knowledge to find the best bidding strategy for their marketing campaigns.\n\nIn this presentation, we will thus cover the following topics:\n\nIntroduction to digital advertisement\n- Modelling advertisement for digital products\n- How to find the optimal bid for your marketing campaign\n- The role that uncertainty on the estimated LTV plays in your marketing strategy\n\nForecasting LTV with PySTAN\n- What is PySTAN\n- How to use PySTAN to estimate the LTV of your marketing campaigns\n- How to achieve the same model through PyMC\n- Comparison between PySTAN and PyMC\n\nReferences\n- The Duopoly is over because everything is an ad network: https://mobiledevmemo.com/the-duopoly-is-over-because-everything-is-an-ad-network/\n- Lifetime Value data from Kaggle: https://www.kaggle.com/datasets/baetulo/lifetime-value?select=train.csv\n- Why Uncertainty Matters when forecasting Lifetime Value: https://raphaeltamaki.github.io/raphaeltamaki/posts/Forecasting%20Customer%20Lifetime%20Value%20-%20Why%20Uncertainty%20Matters/\n\nBio:\nRaphael de Brito Tamaki\nData Science Lead in Marketing Science @Meta, where I use causal inference techniques to extract insights to help advertisers increase their marketing performance. Prior to joining Meta, I worked at Wildlife Studios - a mobile game studio with over 2B total downloads - where I was the Tech Lead for the Lifetime Value (LTV) prediction team, and implemented and maintained LTV models in production for over 10 games.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1721, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json index bddbae400..43c7f046f 100644 --- a/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/francesco-bruzzesi-bayesian-ranking-for-tennis-players-in-pymc-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "In this talk, we will explore the Bayesian Bradley Terry model implemented in PyMC. We will focus on its application for ranking tennis players, demonstrating how this probabilistic approach can provide an accurate and robust rankings, arguably better than the ATP ranking itself and the Elo rating system.\n\nBy leveraging the power of Bayesian statistics, we can incorporate prior knowledge, handle uncertainty, and make better inferences about player abilities. Join us to learn how to implement the Bayesian Bradley Terry model in PyMC and discover its advantages for ranking tennis players.\n\nThe Bradley Terry model is a powerful model to predict the outcome of a paired comparison, as a by-product we will be able to rank players based on their hidden (latent) ability scores. Traditionally, rankings have been based on simple win-loss records, which may not capture the true abilities of players due to variations in competition quality and sample size. By adopting a Bayesian framework, we can overcome these limitations and obtain more reliable rankings.\n\nIn this talk, we will introduce the Bayesian Bradley Terry model and its underlying principles. We will explore how to encode the model in Python using the PyMC library. We will walk through the step-by-step implementation, highlighting key considerations and practical tips.\n\nTo illustrate the model's effectiveness, we will showcase its application to ranking tennis players, and compare it with both the official ATP ranking and the ELO ranking system. Tennis provides an ideal domain for this analysis, as it involves head-to-head matches between players, allowing us to directly compare their abilities. By applying the Bayesian Bradley Terry model to historical tennis match data, we can generate rankings that better reflect players' true skills, accounting for factors such as opponent strength and match surface.\n\nThroughout the talk, we will emphasize a hands-on approach, providing code examples and demonstrations. Attendees will gain a solid understanding of the model, learn how to implement it using PyMC, a practical application, possible extensions and maybe a few PyMC tricks along the way.\n\nOutline\nWhat's wrong with current tennis ranking.\nIntroduction to the Bayesian Bradley Terry model.\nImplementation of the model in PyMC.\nApplication to ranking tennis players by latent ability score.\nComparison with ATP ranking and ELO rating system.\nPossible extensions and other applications.\n\nBio:\nFrancesco Bruzzesi\nData scientist at HelloFresh with a background in pure mathematics.\nOpen source enthusiast and ML practitioner.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. 
PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In this talk, we will explore the Bayesian Bradley Terry model implemented in PyMC. We will focus on its application for ranking tennis players, demonstrating how this probabilistic approach can provide accurate and robust rankings, arguably better than the ATP ranking itself and the Elo rating system.\n\nBy leveraging the power of Bayesian statistics, we can incorporate prior knowledge, handle uncertainty, and make better inferences about player abilities. Join us to learn how to implement the Bayesian Bradley Terry model in PyMC and discover its advantages for ranking tennis players.\n\nThe Bradley Terry model is a powerful model to predict the outcome of a paired comparison; as a by-product, we will be able to rank players based on their hidden (latent) ability scores. Traditionally, rankings have been based on simple win-loss records, which may not capture the true abilities of players due to variations in competition quality and sample size. By adopting a Bayesian framework, we can overcome these limitations and obtain more reliable rankings.\n\nIn this talk, we will introduce the Bayesian Bradley Terry model and its underlying principles. We will explore how to encode the model in Python using the PyMC library. We will walk through the step-by-step implementation, highlighting key considerations and practical tips.\n\nTo illustrate the model's effectiveness, we will showcase its application to ranking tennis players, and compare it with both the official ATP ranking and the Elo ranking system. Tennis provides an ideal domain for this analysis, as it involves head-to-head matches between players, allowing us to directly compare their abilities. By applying the Bayesian Bradley Terry model to historical tennis match data, we can generate rankings that better reflect players' true skills, accounting for factors such as opponent strength and match surface.\n\nThroughout the talk, we will emphasize a hands-on approach, providing code examples and demonstrations. 
Attendees will gain a solid understanding of the model, learn how to implement it using PyMC, see a practical application, and explore possible extensions and maybe a few PyMC tricks along the way.\n\nOutline\nWhat's wrong with current tennis ranking.\nIntroduction to the Bayesian Bradley Terry model.\nImplementation of the model in PyMC.\nApplication to ranking tennis players by latent ability score.\nComparison with ATP ranking and Elo rating system.\nPossible extensions and other applications.\n\nBio:\nFrancesco Bruzzesi\nData scientist at HelloFresh with a background in pure mathematics.\nOpen source enthusiast and ML practitioner.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1513, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json b/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json index 2e7ad7bd7..9f0b641ac 100644 --- a/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/from-vision-to-action-designing-and-deploying-effective-computer-vision-pipelines-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "In the world of computer vision, the focus is often on cutting-edge neural network architectures. However, the true impact usually lies in designing a robust system around the model to solve real-world business challenges. In this talk, we guide you through the process of building practical computer vision pipelines that leverage techniques such as segmentation, classification, and object tracking, demonstrated by our predictive maintenance application at Port of Rotterdam. 
Efficient development and validation processes are ensured by designing a sane data model and writing useful tests. Additionally, we explore the critical topic of maintainability, applying MLOps principles for long-term success.\n\nTo bring these concepts to life, we present a real-world application: the Machine Learning Inspector. This predictive maintenance tool, deployed at the Port of Rotterdam, automatically detects and inspects objects in video streams from trucks and ships, delivering actionable insights. We discuss how we work together with asset inspectors to capture their knowledge of the real world in our artificially intelligent computer vision tool.\n\nJoin us in this talk to gain practical knowledge and valuable insights for designing, deploying, and maintaining computer vision pipelines that drive tangible impact. We aim to empower the audience to build their own computer vision pipelines; with the right design philosophy, every data professional should be able to build computer vision pipelines that might be complex, but not complicated.\n\nBio:\nWesley Boelrijk\nWesley is the Lead Machine Learning Engineer at Xccelerated (part of Xebia). There, he trains and guides junior-to-medior ML Engineers in Xccelerated's one-year program. Besides that, he works as an MLE on various projects, recently at KLM, ProRail, and Port of Rotterdam. In his free time, he likes to stay up-to-date in the ML ecosystem and play around with computer vision.\n\nJeroen Rombouts\nJeroen is an expert in machine learning and AI, specializing in transforming ideas and proof-of-concepts into value-driven products. Leveraging deep expertise in data science and engineering, he offers practical solutions to enhance machine learning infrastructure and elevate data teams' AI skills.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In the world of computer vision, the focus is often on cutting-edge neural network architectures. However, the true impact usually lies in designing a robust system around the model to solve real-world business challenges. In this talk, we guide you through the process of building practical computer vision pipelines that leverage techniques such as segmentation, classification, and object tracking, demonstrated by our predictive maintenance application at Port of Rotterdam. 
Whether you're an experienced expert seeking production-worthy pipelines or a novice with a background in data science or engineering eager to dive into image and video processing, we will explore the use of open-source tools to develop and deploy computer vision applications.\n\nThis talk provides a comprehensive demonstration of creating a powerful computer vision pipeline using widely-used libraries such as PyTorch, torchvision, and OpenCV. We break the pipeline down into manageable components, discussing the importance of proper separation of concerns. Onboarding new use cases becomes a breeze when following best practices in the project structure, combined with user-friendly command-line interfaces. Efficient development and validation processes are ensured by designing a sane data model and writing useful tests. Additionally, we explore the critical topic of maintainability, applying MLOps principles for long-term success.\n\nTo bring these concepts to life, we present a real-world application: the Machine Learning Inspector. This predictive maintenance tool, deployed at the Port of Rotterdam, automatically detects and inspects objects in video streams from trucks and ships, delivering actionable insights. We discuss how we work together with asset inspectors to capture their knowledge of the real world in our artificially intelligent computer vision tool.\n\nJoin us in this talk to gain practical knowledge and valuable insights for designing, deploying, and maintaining computer vision pipelines that drive tangible impact. We aim to empower the audience to build their own computer vision pipelines; with the right design philosophy, every data professional should be able to build computer vision pipelines that might be complex, but not complicated.\n\nBio:\nWesley Boelrijk\nWesley is the Lead Machine Learning Engineer at Xccelerated (part of Xebia). There, he trains and guides junior-to-medior ML Engineers in Xccelerated's one-year program. Besides that, he works as an MLE on various projects, recently at KLM, ProRail, and Port of Rotterdam. In his free time, he likes to stay up-to-date in the ML ecosystem and play around with computer vision.\n\nJeroen Rombouts\nJeroen is an expert in machine learning and AI, specializing in transforming ideas and proof-of-concepts into value-driven products. Leveraging deep expertise in data science and engineering, he offers practical solutions to enhance machine learning infrastructure and elevate data teams' AI skills.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1537, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json b/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json index 159bcc7fe..cf3e47cdd 100644 --- a/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json +++ b/pydata-amsterdam-2023/videos/hadi-abdi-khojasteh-distillation-unleashed-domain-knowledge-transfer-with-compact-neural-networks.json @@ -1,5 +1,5 @@ { - "description": "This talk explores distillation learning, a powerful technique for compressing and transferring knowledge from larger neural networks to smaller, more efficient ones. It delves into its core components and various applications such as model compression and transfer learning. The speaker aims to simplify the topic for all audiences and provides implementation, demonstrating how to apply distillation learning in real scenarios. Attendees will gain insights into developing efficient neural networks by reviewing the various examples of the complex model. The material will be accessible online for convenient access and understanding.\n\nAs the field of artificial intelligence continues to advance, the demand for more efficient and compact neural network models has become increasingly vital. The ability to compress and transfer knowledge from larger, complex models to smaller, more efficient models has emerged as a powerful solution. In this talk, we aim to shed light on the significance of distillation learning and its applications across various domains.\n\nIn an era where data sizes and computational requirements are escalating, distillation learning provides a compelling solution to address the challenges posed by these factors. By utilizing a teacher-student framework, this approach facilitates the transfer of knowledge from a larger, well-performing teacher model to a smaller student model. The student model is trained to mimic the behaviour and output of the teacher model, thereby inheriting its expertise. This process enables the creation of compact models that are not only efficient in terms of memory and inference speed but also capable of performing tasks with comparable proficiency. Distillation learning represents a breakthrough in model compression and transfer learning, revolutionizing the field of artificial intelligence and novel machine learning utilising deep neural networks.\n\nIn this talk, we will provide a comprehensive overview of distillation learning, covering its core components. We will explore the definition and motivation behind, highlighting the role of the teacher model in guiding the student model and the objective of the student model to replicate the teacher model's output. Additionally, we will discuss the diverse applications, including model compression, transfer learning, ensemble learning, multi-task learning, and language models. We will also delve into different types of this learning approach, such as model distillation, knowledge distillation, multi-task distillation, and transfer distillation.\n\nThis talk facilitates knowledge exchange and inspires the development of efficient neural networks. The speaker simplifies the topic, making it accessible to all audiences. 
Simple practical implementation in TensorFlow will be demonstrated, showcasing how attendees can apply this technique in real scenarios. No expertise in complex models is required, and the material will be shared online for convenient access and comprehension.\n\nBio:\nHadi Abdi Khojasteh\nHadi is an R&D senior machine learning engineer at the Deltatre group, where he is an integral member of the innovation lab and a fellow at the Sport Experiences unit, based in Czechia and Italy. With a solid academic background, Hadi is a former lecturer at the Institute for Advanced Studies in Basic Sciences (IASBS) in Iran and as a researcher at the Institute of Formal and Applied Linguistics (\u00daFAL) at Charles University in Prague. Throughout his career, he has actively participated in numerous industrial projects, collaborating closely with renowned experts in the fields of CV/NLP/HLT/CL/ML/DL. His research focuses on multimodal learning inspired by neural models that are both linguistically motivated and tailored to language and vision, visual reasoning and deep learning. His main research interests are Machine Learning, Deep Learning, Computer Vision, Multimodal Learning and Visual Reasoning while he is experienced in a wide variety of international projects on cutting-edge technologies.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.", + "description": "This talk explores distillation learning, a powerful technique for compressing and transferring knowledge from larger neural networks to smaller, more efficient ones. It delves into its core components and various applications such as model compression and transfer learning. The speaker aims to simplify the topic for all audiences and provides an implementation, demonstrating how to apply distillation learning in real scenarios. Attendees will gain insights into developing efficient neural networks by reviewing the various examples of the complex model. The material will be accessible online for convenient access and understanding.\n\nAs the field of artificial intelligence continues to advance, the demand for more efficient and compact neural network models has become increasingly vital. The ability to compress and transfer knowledge from larger, complex models to smaller, more efficient models has emerged as a powerful solution. In this talk, we aim to shed light on the significance of distillation learning and its applications across various domains.\n\nIn an era where data sizes and computational requirements are escalating, distillation learning provides a compelling solution to address the challenges posed by these factors. By utilizing a teacher-student framework, this approach facilitates the transfer of knowledge from a larger, well-performing teacher model to a smaller student model. 
The student model is trained to mimic the behaviour and output of the teacher model, thereby inheriting its expertise. This process enables the creation of compact models that are not only efficient in terms of memory and inference speed but also capable of performing tasks with comparable proficiency. Distillation learning represents a breakthrough in model compression and transfer learning, revolutionizing the field of artificial intelligence and novel machine learning utilising deep neural networks.\n\nIn this talk, we will provide a comprehensive overview of distillation learning, covering its core components. We will explore the definition and motivation behind it, highlighting the role of the teacher model in guiding the student model and the objective of the student model to replicate the teacher model's output. Additionally, we will discuss the diverse applications, including model compression, transfer learning, ensemble learning, multi-task learning, and language models. We will also delve into different types of this learning approach, such as model distillation, knowledge distillation, multi-task distillation, and transfer distillation.\n\nThis talk facilitates knowledge exchange and inspires the development of efficient neural networks. The speaker simplifies the topic, making it accessible to all audiences. Simple practical implementation in TensorFlow will be demonstrated, showcasing how attendees can apply this technique in real scenarios. No expertise in complex models is required, and the material will be shared online for convenient access and comprehension.\n\nBio:\nHadi Abdi Khojasteh\nHadi is an R&D senior machine learning engineer at the Deltatre group, where he is an integral member of the innovation lab and a fellow at the Sport Experiences unit, based in Czechia and Italy. With a solid academic background, Hadi is a former lecturer at the Institute for Advanced Studies in Basic Sciences (IASBS) in Iran and a researcher at the Institute of Formal and Applied Linguistics (\u00daFAL) at Charles University in Prague. Throughout his career, he has actively participated in numerous industrial projects, collaborating closely with renowned experts in the fields of CV/NLP/HLT/CL/ML/DL. His research focuses on multimodal learning inspired by neural models that are both linguistically motivated and tailored to language and vision, visual reasoning and deep learning. His main research interests are Machine Learning, Deep Learning, Computer Vision, Multimodal Learning and Visual Reasoning while he is experienced in a wide variety of international projects on cutting-edge technologies.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.", "duration": 1314, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json b/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json index f57e45801..30bbc6e58 100644 --- a/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json +++ b/pydata-amsterdam-2023/videos/harnessing-uncertainty-the-role-of-probabilistic-time-series-forecasting-in-the-renewable-energy.json @@ -1,5 +1,5 @@ { - "description": "Harnessing uncertainty: the role of probabilistic time series forecasting in the renewable energy transition\n\nHow can probabilistic forecasting accelerate the renewable energy transition? The rapid growth of non-steerable and intermittent wind and solar power requires accurate forecasts and the ability to plan under uncertainty. In this talk, we will make a case for using probabilistic forecasts over deterministic forecasts. We will cover methods for generating and evaluating probabilistic forecasts, and discuss how probabilistic price and wind power forecasts can be combined to derive optimal short-term power trading strategies.\n\nBio:\nAlexander Backus\nAlexander is Data Science Manager at Dexter Energy, where he is currently leading the development of machine learning-powered short-term power trading optimization products. He brings extensive hands-on machine learning engineering and data science management experience from various industries, including organizations such as KLM Royal Dutch Airlines, ING Bank, Heineken, VodafoneZiggo and IKEA.\n\n ===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Harnessing uncertainty: the role of probabilistic time series forecasting in the renewable energy transition\n\nHow can probabilistic forecasting accelerate the renewable energy transition? The rapid growth of non-steerable and intermittent wind and solar power requires accurate forecasts and the ability to plan under uncertainty. In this talk, we will make a case for using probabilistic forecasts over deterministic forecasts. 
We will cover methods for generating and evaluating probabilistic forecasts, and discuss how probabilistic price and wind power forecasts can be combined to derive optimal short-term power trading strategies.\n\nBio:\nAlexander Backus\nAlexander is Data Science Manager at Dexter Energy, where he is currently leading the development of machine learning-powered short-term power trading optimization products. He brings extensive hands-on machine learning engineering and data science management experience from various industries, including organizations such as KLM Royal Dutch Airlines, ING Bank, Heineken, VodafoneZiggo and IKEA.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1575, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json b/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json index e5799436f..e117b2b53 100644 --- a/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json +++ b/pydata-amsterdam-2023/videos/hugging-face-processing-billions-of-tokens-for-training-large-language-models-tools-and-knowledge.json @@ -1,5 +1,5 @@ { - "description": "Keynote by Thomas Wolf. He will be accompanied on stage by Alessandro Cappelli, Julien Launay & Guilherme Penedo, all members of the Hugging Face team in Amsterdam working on large model training.\n\nBio:\nThomas Wolf\nThomas Wolf is a co-founder and Chief Science Officer at Hugging Face. He is passionate about creating open-source software that makes complex research accessible, and most proud of creating the Transformers and Datasets libraries as well as the Magic-Sand tool. When he\u2019s not building OSS, he pushes for open-science in research in AI/ML, trying to lower the gap between academia and industrial labs. His current research interests are centered around overcoming the current limitations of LLMs with multi-modalities and complementary approaches.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. 
The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Keynote by Thomas Wolf. He will be accompanied on stage by Alessandro Cappelli, Julien Launay & Guilherme Penedo, all members of the Hugging Face team in Amsterdam working on large model training.\n\nBio:\nThomas Wolf\nThomas Wolf is a co-founder and Chief Science Officer at Hugging Face. He is passionate about creating open-source software that makes complex research accessible, and most proud of creating the Transformers and Datasets libraries as well as the Magic-Sand tool. When he\u2019s not building OSS, he pushes for open science in AI/ML research, trying to narrow the gap between academia and industrial labs. His current research interests are centered around overcoming the current limitations of LLMs with multi-modalities and complementary approaches.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 2811, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json index f983a4aec..516287454 100644 --- a/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/jakob-willisch-the-proof-of-the-pudding-is-in-the-way-of-eating-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "The proof of the pudding is in the (way of) eating: quasi-experimental methods of causal inference and their practical pitfalls\n\nData scientists and analysts are using quasi-experimental methods to make recommendations based on causality instead of randomized control trials. 
While these methods are easy to use, their assumptions can be complex to explain. This talk will explain these assumptions for data scientists and analysts without in-depth training of causal inference so they can use and explain these methods more confidently to change people's minds using data.\n\nInstead of relying solely on randomized control trials (also known as A/B tests), which are considered the gold standard for inferring causality, data scientists and analysts are increasingly turning to quasi-experimental methods to make recommendations based on causality. These methods, including open-source libraries such as CausalImpact (originally an R package but with numerous Python ports), are easy to use, but their assumptions can be complex to explain. I will break down these assumptions and explain how they can help practitioners determine when to use these methods (and when not to use them), using examples from the world of digital language learning. The key takeaway is that when it comes to changing people's minds using data, explaining assumptions to decision-makers is just as important as understanding the underlying statistics.\n\nOutline\n- Minute 0-5: Introduction and Motivation\n- Minute 5-10: Difference-in-Difference / Bayesian Structural Time-Series\n- Minute 10-15: Case - Conversion effects of content changes based language-pair specific releases at Babbel\n- Minute 15-20: Regression Discontinuity Design\n- Minute 20-25: Case - Estimating motivational effects of language assessment\n- Minute 25-30: Wrap-up / Take-Aways\n\nBio:\nJakob Willisch\nAs Head of Product Data at Babbel, I lead data-scientists, analysts and engineers to improve decision-making of people and machines. Before joining Babbel I did quantitative research in Political Science and Political Economy.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "The proof of the pudding is in the (way of) eating: quasi-experimental methods of causal inference and their practical pitfalls\n\nData scientists and analysts are using quasi-experimental methods to make recommendations based on causality instead of randomized control trials. While these methods are easy to use, their assumptions can be complex to explain. 
This talk will explain these assumptions for data scientists and analysts without in-depth training in causal inference so they can use and explain these methods more confidently to change people's minds using data.\n\nInstead of relying solely on randomized control trials (also known as A/B tests), which are considered the gold standard for inferring causality, data scientists and analysts are increasingly turning to quasi-experimental methods to make recommendations based on causality. These methods, including open-source libraries such as CausalImpact (originally an R package but with numerous Python ports), are easy to use, but their assumptions can be complex to explain. I will break down these assumptions and explain how they can help practitioners determine when to use these methods (and when not to use them), using examples from the world of digital language learning. The key takeaway is that when it comes to changing people's minds using data, explaining assumptions to decision-makers is just as important as understanding the underlying statistics.\n\nOutline\n- Minute 0-5: Introduction and Motivation\n- Minute 5-10: Difference-in-Difference / Bayesian Structural Time-Series\n- Minute 10-15: Case - Conversion effects of content changes based on language-pair specific releases at Babbel\n- Minute 15-20: Regression Discontinuity Design\n- Minute 20-25: Case - Estimating motivational effects of language assessment\n- Minute 25-30: Wrap-up / Take-Aways\n\nBio:\nJakob Willisch\nAs Head of Product Data at Babbel, I lead data scientists, analysts and engineers to improve decision-making of people and machines. Before joining Babbel I did quantitative research in Political Science and Political Economy.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1454, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json index fcfcd30ac..69231bf5c 100644 --- a/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/james-powell-cumulative-index-max-in-pandas-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "How do we speed up a critical missing operation in pandas, the cumulative index max, and what does this tell us about the compromises and considerations we must bring to optimizing our code?\n\nHow do we speed up a critical missing operation in pandas, the cumulative index max, and what does this tell us about the compromises and considerations we must bring to optimizing our code?\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "How do we speed up a critical missing operation in pandas, the cumulative index max, and what does this tell us about the compromises and considerations we must bring to optimizing our code?\n\nHow do we speed up a critical missing operation in pandas, the cumulative index max, and what does this tell us about the compromises and considerations we must bring to optimizing our code?\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1832, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json b/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json index 5af8bc718..f5a7dcb21 100644 --- a/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json +++ b/pydata-amsterdam-2023/videos/jordi-smit-llm-agents-101-how-i-gave-chatgpt-access-to-my-to-do-list-pydata-amsterdam.json @@ -1,5 +1,5 @@ { - "description": "ChatGPT is a fantastic assistant, but it cannot do everything yet. For example, it cannot automatically manage my calendar, update my to-do list, or do anything that requires it to perform actions. However, what would it take to make this a reality? I decided to put it to the test by allowing ChatGPT to manage my to-do list for me.\n\nDuring this presentation, I will tell how I gave ChatGPT access to my to-do list. Along the way, I will introduce you to the concepts behind LLM-based agents and how they work. Of course, I will also give a demo of the final result. After this demo, we will dive into clever engineering solutions and tricks I discovered to solve problems such as handling hallucinations, parsing actions, etc.\n\nThis talk is for people who want to learn how to build their first LLM-based agent. Familiarity with Python, PyDantic, and LMMs is nice during this presentation but not essential. As long as you love overengineered solutions to a basic to-do list, you will like this presentation.\n\nDuring the presentation, we will discuss things such as:\n- How to give ChatGPT access to your ToDo(ist) list?\n- What are LLM agents?\n- What is the REACT framework?\n- A demo of the agent I built to manage my to-do list.\n- Implementation tips and tricks to make the agent work better.\n\nThe repo can be found here:\ngithub.com/j0rd1smit/todoist_react_agent\n\nBio:\nJordi Smit\nHi! My name is Jordi Smit. I\u2019m deeply passionate about software engineering, data science, and automation. Nothing makes me happier than creating software that helps humans by automating a tedious and manual-intensive part of their job. Therefore, I love discussing data science since this field has opened the door to many new kinds of automation. However, data science solutions often stay stuck at the proof of concept level. To combat this issue, you also need software engineering knowledge. That is why I love the intersection between software engineering, data science, and automation.\n\nI work as a Machine Learning Engineer at Xebia Data in Amsterdam. Here, I help companies to transform their ML-based models into production-ready applications. I love this job because it allows me to explore the intersection between software engineering and data science daily.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. 
\n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "ChatGPT is a fantastic assistant, but it cannot do everything yet. For example, it cannot automatically manage my calendar, update my to-do list, or do anything that requires it to perform actions. However, what would it take to make this a reality? I decided to put it to the test by allowing ChatGPT to manage my to-do list for me.\n\nDuring this presentation, I will tell you how I gave ChatGPT access to my to-do list. Along the way, I will introduce you to the concepts behind LLM-based agents and how they work. Of course, I will also give a demo of the final result. After this demo, we will dive into clever engineering solutions and tricks I discovered to solve problems such as handling hallucinations, parsing actions, etc.\n\nThis talk is for people who want to learn how to build their first LLM-based agent. Familiarity with Python, Pydantic, and LLMs is nice during this presentation but not essential. As long as you love overengineered solutions to a basic to-do list, you will like this presentation.\n\nDuring the presentation, we will discuss things such as:\n- How to give ChatGPT access to your ToDo(ist) list?\n- What are LLM agents?\n- What is the ReAct framework?\n- A demo of the agent I built to manage my to-do list.\n- Implementation tips and tricks to make the agent work better.\n\nThe repo can be found here:\ngithub.com/j0rd1smit/todoist_react_agent\n\nBio:\nJordi Smit\nHi! My name is Jordi Smit. I\u2019m deeply passionate about software engineering, data science, and automation. Nothing makes me happier than creating software that helps humans by automating a tedious and manual-intensive part of their job. Therefore, I love discussing data science since this field has opened the door to many new kinds of automation. However, data science solutions often stay stuck at the proof of concept level. To combat this issue, you also need software engineering knowledge. That is why I love the intersection between software engineering, data science, and automation.\n\nI work as a Machine Learning Engineer at Xebia Data in Amsterdam. Here, I help companies to transform their ML-based models into production-ready applications. I love this job because it allows me to explore the intersection between software engineering and data science daily.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1307, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json index 62b81f567..edd349c92 100644 --- a/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/katharine-jarmul-encrypted-computation-what-if-decryption-wasn-t-needed-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "If you are curious about the field of cryptography and what it has to offer data science and machine learning, this talk is for you! We'll dive into the field of encrypted computation, where decryption isn't needed in order to perform calculations, transformations and operations on the data. You'll learn some of the core mathematical theory behind why and how this works, as well as the differences between approaches like homomorphic encryption and secure multi-party computation. At the end, you'll get some pointers and open-source library hints on where to go next and how to start using encrypted computation for problems you are solving the hard way (or not solving at all).\n\nBio:\nKatharine Jarmul\nKatharine Jarmul is a privacy activist and data scientist whose work and research focuses on privacy and security in data science workflows. She recently authored Practical Data Privacy for O'Reilly and works as a Principal Data Scientist at Thoughtworks. Katharine has held numerous leadership and independent contributor roles at large companies and startups in the US and Germany -- implementing data processing and machine learning systems with privacy and security built in and developing forward-looking, privacy-first data strategy.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "If you are curious about the field of cryptography and what it has to offer data science and machine learning, this talk is for you! 
We'll dive into the field of encrypted computation, where decryption isn't needed in order to perform calculations, transformations and operations on the data. You'll learn some of the core mathematical theory behind why and how this works, as well as the differences between approaches like homomorphic encryption and secure multi-party computation. At the end, you'll get some pointers and open-source library hints on where to go next and how to start using encrypted computation for problems you are solving the hard way (or not solving at all).\n\nBio:\nKatharine Jarmul\nKatharine Jarmul is a privacy activist and data scientist whose work and research focuses on privacy and security in data science workflows. She recently authored Practical Data Privacy for O'Reilly and works as a Principal Data Scientist at Thoughtworks. Katharine has held numerous leadership and independent contributor roles at large companies and startups in the US and Germany -- implementing data processing and machine learning systems with privacy and security built in and developing forward-looking, privacy-first data strategy.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1708, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json index 39f1135cd..635dea83a 100644 --- a/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/katharine-jarmul-keynote-ai-without-dystopia-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Many of us have heard terms like Data for Good, Ethical Machine Learning, Human-Centric Product Design, but those words also bring forward questions -- if we need \"Ethical ML\" what is the rest of machine learning? The current conversation around AI Doom paints a picture where AI goes hand-in-hand with dystopian outcomes. In this keynote, we'll explore what AI could look like if at the core, it was led by these ideals. What if distributed, communal machine learning were a central focus? What if privacy and user choice were a part of our everyday machine learning frameworks? What if aid organizations, governments, coalitions helped shape the problems for AI research? 
Let's ponder these questions and their outcomes together, imagining AI without the potential for dystopia.\n\nBio:\nKatharine Jarmul\nKatharine Jarmul is a privacy activist and data scientist whose work and research focuses on privacy and security in data science workflows. She recently authored Practical Data Privacy for O'Reilly and works as a Principal Data Scientist at Thoughtworks. Katharine has held numerous leadership and independent contributor roles at large companies and startups in the US and Germany -- implementing data processing and machine learning systems with privacy and security built in and developing forward-looking, privacy-first data strategy.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Many of us have heard terms like Data for Good, Ethical Machine Learning, Human-Centric Product Design, but those words also bring forward questions -- if we need \"Ethical ML\" what is the rest of machine learning? The current conversation around AI Doom paints a picture where AI goes hand-in-hand with dystopian outcomes. In this keynote, we'll explore what AI could look like if at the core, it was led by these ideals. What if distributed, communal machine learning were a central focus? What if privacy and user choice were a part of our everyday machine learning frameworks? What if aid organizations, governments, coalitions helped shape the problems for AI research? Let's ponder these questions and their outcomes together, imagining AI without the potential for dystopia.\n\nBio:\nKatharine Jarmul\nKatharine Jarmul is a privacy activist and data scientist whose work and research focuses on privacy and security in data science workflows. She recently authored Practical Data Privacy for O'Reilly and works as a Principal Data Scientist at Thoughtworks. Katharine has held numerous leadership and independent contributor roles at large companies and startups in the US and Germany -- implementing data processing and machine learning systems with privacy and security built in and developing forward-looking, privacy-first data strategy.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. 
PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 2149, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json b/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json index 36576095f..71d3fa911 100644 --- a/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/kevin-klein-causal-inference-libraries-what-they-do-what-i-d-like-them-to-do-pd-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "This talk will explore the Python tooling and ecosystem for estimating conditional average treatment effects (CATEs) in a Causal Inference setting. Using real world-examples, it will compare and contrast the pros and cons of various existing libraries as well as outline desirable functionalities not currently offered by any public library.\n\nConditional average treatment effects (CATEs) are a fundamental concept in Causal Inference, allowing for the estimation of the effect of a particular treatment or intervention. For CATEs, the effect estimation is not only with respect to an entire population, e.g. all experiment participants, but rather with respect to units, e.g. a single experiment participant, with individual characteristics. This can be very important to meaningfully personalize services and products. In this talk, we will explore the Python tooling and ecosystem for estimating CATEs, including libraries such as EconML and CausalML.\n\nWe will begin by providing an overview of the theory behind CATE estimation, how it fits into the broader field of causal inference and how Machine Learning has recently broken into CATE estimation. We will then dive into the various libraries available for Python, discussing their strengths and weaknesses and providing real-world examples of their usage.\n\nSpecifically, we will cover:\n- EconML: An open-source library for general Causal Inference purposes, by Microsoft Research\n- CausalML: An open-source library for uplift modeling in particular, by Uber\n\nWe will compare and contrast these libraries with respect to CATE estimation, discussing which methods they use, which assumptions they make, and which types of data they are best suited for. We will also provide code examples to illustrate how to use each library in practice. Moreover, we will discuss what we think is missing from both of them.\n\nBy the end of the talk, attendees will have a solid understanding of the Python tooling and ecosystem for estimating CATEs in a causal inference setting. 
They will know which libraries to use for different types of data and which methods are most appropriate for different scenarios.\n\nThis talk could be particularly relevant for Data Scientists wishing to analyze experiments, such as A/B tests, or trying to derive causal statements from observational, non-experimental data. Participants are not expected to have Causal Inference expertise. Yet, a fundamental understanding of Machine Learning and Probability Theory will be beneficial.\n\n0-5\u2019: Why Causal Inference and why CATE estimation?\n5-10\u2019: What are some conceptual ways of estimating CATEs?\n10-20\u2019: How can we use EconML and CausalML for CATE estimation on a real dataset?\n20-30\u2019: What are we missing from EconML and CausalML?\n\nBio:\nKevin Klein\nKevin is a Data Scientist at QuantCo, working on fraud detection, risk modelling and experimentation. Prior to joining QuantCo, he focused on Natural Language Processing, discrete optimization and Bayesian optimization during his Computer Science major at ETH, Zurich.\nHe's not very original in that he likes functional programming, running and writing.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "This talk will explore the Python tooling and ecosystem for estimating conditional average treatment effects (CATEs) in a Causal Inference setting. Using real-world examples, it will compare and contrast the pros and cons of various existing libraries as well as outline desirable functionalities not currently offered by any public library.\n\nConditional average treatment effects (CATEs) are a fundamental concept in Causal Inference, allowing for the estimation of the effect of a particular treatment or intervention. For CATEs, the effect estimation is not only with respect to an entire population, e.g. all experiment participants, but rather with respect to units, e.g. a single experiment participant, with individual characteristics. This can be very important to meaningfully personalize services and products. In this talk, we will explore the Python tooling and ecosystem for estimating CATEs, including libraries such as EconML and CausalML.\n\nWe will begin by providing an overview of the theory behind CATE estimation, how it fits into the broader field of causal inference and how Machine Learning has recently broken into CATE estimation. 
We will then dive into the various libraries available for Python, discussing their strengths and weaknesses and providing real-world examples of their usage.\n\nSpecifically, we will cover:\n- EconML: An open-source library for general Causal Inference purposes, by Microsoft Research\n- CausalML: An open-source library for uplift modeling in particular, by Uber\n\nWe will compare and contrast these libraries with respect to CATE estimation, discussing which methods they use, which assumptions they make, and which types of data they are best suited for. We will also provide code examples to illustrate how to use each library in practice. Moreover, we will discuss what we think is missing from both of them.\n\nBy the end of the talk, attendees will have a solid understanding of the Python tooling and ecosystem for estimating CATEs in a causal inference setting. They will know which libraries to use for different types of data and which methods are most appropriate for different scenarios.\n\nThis talk could be particularly relevant for Data Scientists wishing to analyze experiments, such as A/B tests, or trying to derive causal statements from observational, non-experimental data. Participants are not expected to have Causal Inference expertise. Yet, a fundamental understanding of Machine Learning and Probability Theory will be beneficial.\n\n0-5\u2019: Why Causal Inference and why CATE estimation?\n5-10\u2019: What are some conceptual ways of estimating CATEs?\n10-20\u2019: How can we use EconML and CausalML for CATE estimation on a real dataset?\n20-30\u2019: What are we missing from EconML and CausalML?\n\nBio:\nKevin Klein\nKevin is a Data Scientist at QuantCo, working on fraud detection, risk modelling and experimentation. Prior to joining QuantCo, he focused on Natural Language Processing, discrete optimization and Bayesian optimization during his Computer Science major at ETH, Zurich.\nHe's not very original in that he likes functional programming, running and writing.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1340, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json b/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json index 5c7a0d207..b16086665 100644 --- a/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/krishi-sharma-innovation-in-the-age-of-regulation-federated-learning-with-flower-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "With the rise of data privacy concerns around AI in the EU, how can we innovate using AI capabilities despite regulations around consumer data? What tools and features are available to help us build AI in regulated industries? This talk will discuss how we can leverage diverse datasets to build better AI models without ever having to touch the datasets by using a Python library called Flower.\n\nIn this talk, we\u2019ll review the importance of data privacy concerns, particularly in the EU, and address how we can build AI using sensitive data. We'll discuss a few machine learning techniques (classical, distributed and federated learning), and show how federated learning can help us train AI models without ever touching the sensitive data.\n\nThen, we'll evaluate a few main open source Python packages that help engineers get started with federated learning and why Flower is a valuable option to consider for your next project. We'll review the core features of Flower; most notably, it's ease of use.\n\nAfter that, we\u2019ll jump into a demo and show how, with minimal code, a Python engineer can orchestrate a training job with multiple data sources using federated learning. We\u2019ll walk through different parameters that give engineers the power to control and fine tune the server without the hassle of knowing infrastructure or cloud architecture.\n\nBy the end of this talk, you\u2019ll be able to:\n\nUnderstand the role of federated learning in a landscape with increasing regulation around AI, particularly in the EU with the proposed Artificial Intelligence Act\nDifferentiate between federated learning and classical machine learning\nDesign your project so that it is in compliance with current and future legislation passed on how to use personal data\nBuild and fine tune a server that hosts the model weights for a model trained without seeing personal data\nUnderstand options available to increase the privacy around the data that is used to train the model\nThere will be a link to a Github repo at the end of the talk that contains all the code used in the demo in order to help you get started with your first federated learning project.\n\nBio:\nKrishi Sharma\nKrishi Sharma is a software developer at KUNGFU.AI where she builds software applications that power machine learning models and deliver data for a broad range of services. As a former data scientist and machine learning engineer, she is passionate about building tools that ease the infrastructure dependencies and reduce potential technical debt around handling data. 
She helped build and maintains an internal Python tool, Potluck, which allows machine learning engineers the ability to bootstrap a containerized, production ready application with data pipelining templates so that her team can focus on the data and metrics without squandering too much time finagling with deployment and software\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "With the rise of data privacy concerns around AI in the EU, how can we innovate using AI capabilities despite regulations around consumer data? What tools and features are available to help us build AI in regulated industries? This talk will discuss how we can leverage diverse datasets to build better AI models without ever having to touch the datasets by using a Python library called Flower.\n\nIn this talk, we\u2019ll review the importance of data privacy concerns, particularly in the EU, and address how we can build AI using sensitive data. We'll discuss a few machine learning techniques (classical, distributed and federated learning), and show how federated learning can help us train AI models without ever touching the sensitive data.\n\nThen, we'll evaluate a few main open source Python packages that help engineers get started with federated learning and why Flower is a valuable option to consider for your next project. We'll review the core features of Flower; most notably, its ease of use.\n\nAfter that, we\u2019ll jump into a demo and show how, with minimal code, a Python engineer can orchestrate a training job with multiple data sources using federated learning. 
We\u2019ll walk through different parameters that give engineers the power to control and fine tune the server without the hassle of knowing infrastructure or cloud architecture.\n\nBy the end of this talk, you\u2019ll be able to:\n\nUnderstand the role of federated learning in a landscape with increasing regulation around AI, particularly in the EU with the proposed Artificial Intelligence Act\nDifferentiate between federated learning and classical machine learning\nDesign your project so that it is in compliance with current and future legislation passed on how to use personal data\nBuild and fine tune a server that hosts the model weights for a model trained without seeing personal data\nUnderstand options available to increase the privacy around the data that is used to train the model\nThere will be a link to a Github repo at the end of the talk that contains all the code used in the demo in order to help you get started with your first federated learning project.\n\nBio:\nKrishi Sharma\nKrishi Sharma is a software developer at KUNGFU.AI where she builds software applications that power machine learning models and deliver data for a broad range of services. As a former data scientist and machine learning engineer, she is passionate about building tools that ease the infrastructure dependencies and reduce potential technical debt around handling data. She helped build and maintains an internal Python tool, Potluck, which allows machine learning engineers to bootstrap a containerized, production-ready application with data pipelining templates so that her team can focus on the data and metrics without squandering too much time finagling with deployment and software\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1487, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json index fb922b8e5..ced96a820 100644 --- a/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/laura-summers-ok-doomer-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "AI won't end the world, but it can and is making life miserable for plenty of folks. 
Instead of engaging with the AI overlords, let's explore a pragmatic set of design choices that all Data Scientists and ML devs can implement right now, to reduce the risks of deploying AI systems in the real world.\n\nLeave the AI boomers to grumble amongst themselves about x-risk and the singularity. Instead let's focus-in on how we can alleviate the real-world harms happening right now.\n\nToo often attempts to identify risks and respond to failure modes of ML and automated systems dive straight into the specifics of model, stack, and implementation. Or worse, add further impenetrable layers of abstraction - the \"more models, more problems\" syndrome. While it's encouraging to see the ecosystem of explainability tools and ML ops surging, as developers and pragmatists we should always prefer the simplest and cheapest tool in our toolkit which is fit for purpose.\n\nThis talk calls attention to a number of existing simple, cheap and effective levers for flagging and reducing risk that are often overlooked.\n\nThese are software design fundamentals like timely and contextual feedback loops, or graceful degradation, that are easily forgotten in the rush to market. These pragmatic tools and product design choices can immediately improve visibility, safety and reduce reputational risk for any team implementing AI.\n\nP.S. Better oversight and tooling for our current tech will, by definition, improve our chances of being alerted if an existentially risky intelligence did happen to emerge from the silicon ether, one day. So it's a win win, really. \ud83e\udd37\u200d\u2640\ufe0f\n\nBio:\nLaura Summers\nLaura is a Design Engineer and Prodigy Teams Product Lead at Explosion AI.\n\nShe is the founder of Debias AI, (debias.ai) and the human behind Sweet Summer Child Score (summerchild.dev), Ethics Litmus Tests (ethical-litmus.site), fairXiv (fairxiv.org), the Melbourne Fair ML reading group (groups.io/g/fair-ml). Laura is passionate about feminism, digital rights and designing for privacy. She speaks, writes and runs workshops at the intersection of design and technology.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "AI won't end the world, but it can and is making life miserable for plenty of folks. Instead of engaging with the AI overlords, let's explore a pragmatic set of design choices that all Data Scientists and ML devs can implement right now, to reduce the risks of deploying AI systems in the real world.\n\nLeave the AI boomers to grumble amongst themselves about x-risk and the singularity. 
Instead let's focus in on how we can alleviate the real-world harms happening right now.\n\nToo often attempts to identify risks and respond to failure modes of ML and automated systems dive straight into the specifics of model, stack, and implementation. Or worse, add further impenetrable layers of abstraction - the \"more models, more problems\" syndrome. While it's encouraging to see the ecosystem of explainability tools and ML ops surging, as developers and pragmatists we should always prefer the simplest and cheapest tool in our toolkit which is fit for purpose.\n\nThis talk calls attention to a number of existing simple, cheap and effective levers for flagging and reducing risk that are often overlooked.\n\nThese are software design fundamentals like timely and contextual feedback loops, or graceful degradation, that are easily forgotten in the rush to market. These pragmatic tools and product design choices can immediately improve visibility, safety and reduce reputational risk for any team implementing AI.\n\nP.S. Better oversight and tooling for our current tech will, by definition, improve our chances of being alerted if an existentially risky intelligence did happen to emerge from the silicon ether, one day. So it's a win-win, really. \ud83e\udd37\u200d\u2640\ufe0f\n\nBio:\nLaura Summers\nLaura is a Design Engineer and Prodigy Teams Product Lead at Explosion AI.\n\nShe is the founder of Debias AI (debias.ai) and the human behind Sweet Summer Child Score (summerchild.dev), Ethics Litmus Tests (ethical-litmus.site), fairXiv (fairxiv.org), the Melbourne Fair ML reading group (groups.io/g/fair-ml). Laura is passionate about feminism, digital rights and designing for privacy. She speaks, writes and runs workshops at the intersection of design and technology.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1685, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json b/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json index e30538984..92722ad06 100644 --- a/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/lets-do-the-time-warp-again-time-series-machine-learning-with-distance-functions-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "Many algorithms for machine learning from time series are based on measuring the distance or similarity between series. The most popular distance measure is dynamic time warping, which attempts to optimally realign two series to compensate for offest. There are many others though. We present an overview of the most popular time series specific distance functions and describe their speed optimised implementations in aeon, a scikit-learn compatible time series machine learning toolkit. We demonstrate their application for clustering, classification and regression on a real world case study and highlight some of the latest distance based time series machine learning tools available in aeon.\n\nThis talk introduces you to popular time series distance functions and demonstrates their usage in exploratory and predictive modelling of time series. Participants will come away with an idea of how to use the very latest research into time series distances for clustering, classification and regression using the aeon toolkit and scikit learn. The talk will be mostly practical and code based, with some algorithmic and mathematical notation.\n\nDistances are used in all forms of time series machine learning. They can help explore collections of time series through clustering, reduce dimensionality by averaging and be used with instance based or kernel based classifiers and regressors. They are used in streaming based anomaly detection and change point detection and have been embedded within tree based ensembles for classification.\n\nThe basic problem in specifying a distance function is to quantify how dissimilar two series are. Elastic distances attempt to compensate for small mis-alignments caused by offset that would make similar series look very different to measures such as Euclidean distance or correlation. There have been many different algorithms that combine forms of time warping (stretching the indexes to realign series) and editing (removing time points from one of the series to improve alignment). In the first part of the talk we will provide a high level overview and visualisation of the differences between these algorithms before describing the aeon toolkit, which contains the most comprehensive and fastest library of elastic distances that we are aware of. aeon distances can be used directly with sklearn distance based algorithms and with the many time series specific algorithms for classification, clustering and regression available in aeon. In the the middle section of the tutorial we will use a real world industrial dataset to demonstrate use cases in clustering, classification and regression. We will end with some pointers to the very latest research into using distance functions. 
We will require attendees to have a basic knowledge of scikit-learn and standard machine learning algorithms.\n\nThis should appeal to anyone interested in machine learning from time series. It will focus on practical application and algorithm comprehension rather than maths, and will identify the very latest research into algorithm development to suggest further reading. We will provide easy to follow notbooks prior to the talk and all examples will be freely available.\n\nBio:\nTony Bagnall\nTony is a Professor of Computer Science at the University of East Anglia, where he leads the time series machine learning group. His primary research interest is in time series machine learning, with a historic focus on classification, but more recently looking at clustering and regression. He has a side interest in ensemble design.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Many algorithms for machine learning from time series are based on measuring the distance or similarity between series. The most popular distance measure is dynamic time warping, which attempts to optimally realign two series to compensate for offset. There are many others though. We present an overview of the most popular time series specific distance functions and describe their speed optimised implementations in aeon, a scikit-learn compatible time series machine learning toolkit. We demonstrate their application for clustering, classification and regression on a real world case study and highlight some of the latest distance based time series machine learning tools available in aeon.\n\nThis talk introduces you to popular time series distance functions and demonstrates their usage in exploratory and predictive modelling of time series. Participants will come away with an idea of how to use the very latest research into time series distances for clustering, classification and regression using the aeon toolkit and scikit-learn. The talk will be mostly practical and code based, with some algorithmic and mathematical notation.\n\nDistances are used in all forms of time series machine learning. They can help explore collections of time series through clustering, reduce dimensionality by averaging and be used with instance based or kernel based classifiers and regressors. They are used in streaming based anomaly detection and change point detection and have been embedded within tree based ensembles for classification.\n\nThe basic problem in specifying a distance function is to quantify how dissimilar two series are. 
Elastic distances attempt to compensate for small mis-alignments caused by offset that would make similar series look very different to measures such as Euclidean distance or correlation. There have been many different algorithms that combine forms of time warping (stretching the indexes to realign series) and editing (removing time points from one of the series to improve alignment). In the first part of the talk we will provide a high level overview and visualisation of the differences between these algorithms before describing the aeon toolkit, which contains the most comprehensive and fastest library of elastic distances that we are aware of. aeon distances can be used directly with sklearn distance based algorithms and with the many time series specific algorithms for classification, clustering and regression available in aeon. In the middle section of the tutorial we will use a real world industrial dataset to demonstrate use cases in clustering, classification and regression. We will end with some pointers to the very latest research into using distance functions. We will require attendees to have a basic knowledge of scikit-learn and standard machine learning algorithms.\n\nThis should appeal to anyone interested in machine learning from time series. It will focus on practical application and algorithm comprehension rather than maths, and will identify the very latest research into algorithm development to suggest further reading. We will provide easy to follow notebooks prior to the talk and all examples will be freely available.\n\nBio:\nTony Bagnall\nTony is a Professor of Computer Science at the University of East Anglia, where he leads the time series machine learning group. His primary research interest is in time series machine learning, with a historic focus on classification, but more recently looking at clustering and regression. He has a side interest in ensemble design.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
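As an illustration of the workflow this abstract describes — aeon's elastic distances used both directly and inside a nearest-neighbour classifier — something along these lines should work with a recent aeon release (the exact module paths are an assumption and may differ between versions):

~~~python
# Sketch: computing DTW with aeon and fitting a distance-based
# classifier with a scikit-learn style API. Assumes a recent aeon
# release; module paths may vary between versions.
import numpy as np
from aeon.distances import dtw_distance
from aeon.classification.distance_based import KNeighborsTimeSeriesClassifier

x = np.array([[0.0, 1.0, 2.0, 3.0, 4.0]])  # (n_channels, n_timepoints)
y = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]])  # same shape, offset by one

print(dtw_distance(x, y))  # small value: warping absorbs the offset

# 1-NN time series classification on toy data
X_train = np.random.default_rng(0).normal(size=(10, 1, 50))
y_train = np.array([0, 1] * 5)
clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, distance="dtw")
clf.fit(X_train, y_train)
print(clf.predict(X_train[:2]))
~~~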
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1484, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json index d36ec9463..4a98d3c4e 100644 --- a/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "In the Netherlands a large share of energy is used by industry. By measuring the energy usage of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste. It turns out that in most factories, the biggest source of energy waste comes from idling machines. To be able to give valuable insights and provide relevant alerts to our customers, we set up a machine learning system for standby detection with a \u201chuman in the loop\u201d. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem. No background knowledge is required for this talk.\n\nIn the Netherlands a large share of energy is used by industry (less than 40% compared to only 14% used by households*). Eliminating energy waste in this sector is a big step forward towards a greener future. Therefore, Sensorfact made it its mission to eliminate all industrial energy waste. By measuring the energy usage (electricity or gas) of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste.\n\nIt turns out that in most factories, the biggest source of energy waste comes from forgetting to turn off machines when they are not used. Flagging idling machines based on their electricity usage may seem like a trivial problem at first, however the large variety in machines and production processes makes this a lot harder than you would expect. To be able to give valuable insights on idling machines and provide relevant alerts to our customers, we set up a machine learning system with a \u201chuman in the loop\u201d.\n\nIn many settings it is perfectly fine to embed a machine learning model in a process without any human interference. However, there are cases where it is better to keep a human in the loop. The most obvious use cases are those where there is simply no room for error, for example in medical applications. However, also in less life threatening it can be beneficial to have a human act as gatekeeper ensuring high quality outputs. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem, using the case of standby detection. We will share learnings from our own experience and along the way give you an overview of the (open source) tools we chose to use for the different facets of the project.\n\nNo background knowledge is required for this talk. 
If you are looking for inspiration on how to build a machine learning system with a human in the loop or if you are curious about sustainability use cases this talk may be interesting for you.\n\n*https://www.clo.nl/indicatoren/nl0052-energieverbruik-per-sector\n\nBio:\nLieke Kools\nLieke is lead data scientist at Sensorfact, a company aiming to eliminate all industrial energy waste for SME\u2019s. In her role she focusses on the data fueled products that help their consultants to efficiently and effectively give advice to customers. Before joining Sensorfact she worked as a data science consultant at Vantage AI and completed a PhD in econometrics.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In the Netherlands a large share of energy is used by industry. By measuring the energy usage of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste. It turns out that in most factories, the biggest source of energy waste comes from idling machines. To be able to give valuable insights and provide relevant alerts to our customers, we set up a machine learning system for standby detection with a \u201chuman in the loop\u201d. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem. No background knowledge is required for this talk.\n\nIn the Netherlands a large share of energy is used by industry (less than 40% compared to only 14% used by households*). Eliminating energy waste in this sector is a big step forward towards a greener future. Therefore, Sensorfact made it its mission to eliminate all industrial energy waste. By measuring the energy usage (electricity or gas) of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste.\n\nIt turns out that in most factories, the biggest source of energy waste comes from forgetting to turn off machines when they are not used. Flagging idling machines based on their electricity usage may seem like a trivial problem at first, however the large variety in machines and production processes makes this a lot harder than you would expect. 
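To see why the naive approach is deceptively simple (a purely hypothetical sketch, not Sensorfact's system): a fixed-threshold detector is a few lines of pandas, and all the real difficulty — per-machine thresholds, varied production processes, verification of alerts — is exactly what the human in the loop addresses:

~~~python
# Naive standby detection: flag periods where smoothed power draw
# stays under a fixed threshold. Threshold and data are hypothetical.
import pandas as pd

def flag_standby(power: pd.Series, threshold_watts: float = 50.0,
                 window: str = "15min") -> pd.Series:
    smoothed = power.rolling(window).mean()
    return smoothed < threshold_watts  # True where the machine looks idle

idx = pd.date_range("2023-01-01", periods=8, freq="15min")
watts = pd.Series([500, 480, 30, 25, 20, 510, 30, 28], index=idx)
print(flag_standby(watts))
~~~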
To be able to give valuable insights on idling machines and provide relevant alerts to our customers, we set up a machine learning system with a \u201chuman in the loop\u201d.\n\nIn many settings it is perfectly fine to embed a machine learning model in a process without any human interference. However, there are cases where it is better to keep a human in the loop. The most obvious use cases are those where there is simply no room for error, for example in medical applications. However, also in less life-threatening settings it can be beneficial to have a human act as gatekeeper ensuring high quality outputs. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem, using the case of standby detection. We will share learnings from our own experience and along the way give you an overview of the (open source) tools we chose to use for the different facets of the project.\n\nNo background knowledge is required for this talk. If you are looking for inspiration on how to build a machine learning system with a human in the loop or if you are curious about sustainability use cases this talk may be interesting for you.\n\n*https://www.clo.nl/indicatoren/nl0052-energieverbruik-per-sector\n\nBio:\nLieke Kools\nLieke is lead data scientist at Sensorfact, a company aiming to eliminate all industrial energy waste for SME\u2019s. In her role she focusses on the data fueled products that help their consultants to efficiently and effectively give advice to customers. Before joining Sensorfact she worked as a data science consultant at Vantage AI and completed a PhD in econometrics.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1220, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json b/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json index a48a583d5..2500fe514 100644 --- a/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json +++ b/pydata-amsterdam-2023/videos/mael-deschamps-our-journey-using-data-and-ai-to-help-monitor-wildlife-in-parks-in-africa.json @@ -1,5 +1,5 @@ { - "description": "Exploration of the intersection between data, AI, and environmental conservation. 
In this talk, we will share our experiences and practical insights during our journey trying to develop a system using Python, camera traps and data-driven techniques to help detect poachers in Africa.\n\nIn this storytelling and informative talk, we will delve into our experience of data and AI to monitor wildlife in parks in Africa. Our objective is to provide attendees with a comprehensive understanding of the applications, challenges, and opportunities of leveraging data-driven techniques in environmental conservation.\n\nAudience : individuals interested in leveraging data for positive impact.\n\nThe talk is accessible to a non-technical audience in its story-telling part, but also contains technical parts and details, as well as a live demonstration of the developed and open-sourced solution. Knowledge of Python and cloud infrastructures may be useful.\nTechnologies explored : Python, Node-RED, Streamlit, Google Cloud Platform, Google Vision API, Zamba, Earth Rangers.\n\nBios:\nMa\u00ebl Deschamps\nManager Machine Learning Engineer, I lead the MLOps Expertise in a team of 20+ Data Engineers & Data Scientist. During my time between Shanghai and Amsterdam I explored 15+ project for 10+ clients working in various industries.\nI find great joy in making both my teams and clients happy. I believe in management through empathy and transparency and I'm passionate about Data Sustainability and all its related technical challenges.\nFeel free to reach-out to discuss any of those topics.\n\nSimone Gayed Said\nHello Hello! \ud83c\udf1f I'm Simone, I work as a Machine Learning engineer, and I'm all about using my skills to make a positive impact on the World! \ud83d\ude80\u2728\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Exploration of the intersection between data, AI, and environmental conservation. In this talk, we will share our experiences and practical insights during our journey trying to develop a system using Python, camera traps and data-driven techniques to help detect poachers in Africa.\n\nIn this storytelling and informative talk, we will delve into our experience of data and AI to monitor wildlife in parks in Africa. 
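One building block from the technology list further below, the Google Cloud Vision API, can be sketched for a single camera-trap frame as follows (assumes configured Google Cloud credentials and a local image file; this is not the speakers' actual code):

~~~python
# Sketch: labelling one camera-trap image with the Google Cloud
# Vision API. Credentials and the file name are assumptions.
from google.cloud import vision

client = vision.ImageAnnotatorClient()
with open("camera_trap_frame.jpg", "rb") as f:
    image = vision.Image(content=f.read())

response = client.label_detection(image=image)
for label in response.label_annotations:
    # e.g. "Elephant 0.97" for a frame with an elephant in it
    print(label.description, round(label.score, 2))
~~~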
Our objective is to provide attendees with a comprehensive understanding of the applications, challenges, and opportunities of leveraging data-driven techniques in environmental conservation.\n\nAudience : individuals interested in leveraging data for positive impact.\n\nThe talk is accessible to a non-technical audience in its story-telling part, but also contains technical parts and details, as well as a live demonstration of the developed and open-sourced solution. Knowledge of Python and cloud infrastructures may be useful.\nTechnologies explored : Python, Node-RED, Streamlit, Google Cloud Platform, Google Vision API, Zamba, Earth Rangers.\n\nBios:\nMa\u00ebl Deschamps\nManager Machine Learning Engineer, I lead the MLOps Expertise in a team of 20+ Data Engineers & Data Scientists. During my time between Shanghai and Amsterdam I explored 15+ projects for 10+ clients working in various industries.\nI find great joy in making both my teams and clients happy. I believe in management through empathy and transparency and I'm passionate about Data Sustainability and all its related technical challenges.\nFeel free to reach out to discuss any of those topics.\n\nSimone Gayed Said\nHello Hello! \ud83c\udf1f I'm Simone, I work as a Machine Learning engineer, and I'm all about using my skills to make a positive impact on the World! \ud83d\ude80\u2728\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1147, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json b/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json index 7f16fea39..87ce49633 100644 --- a/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/maryam-miradi-deep-look-into-deepfakes-mastering-creation-impact-and-detection-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "Deepfakes, a form of synthetic media where a person's image or video is seamlessly replaced using Generative AI like GANs, have recieved significant attention. This talk aims to provide a comprehensive exploration of deepfakes, covering their creation process, positive and negative effects, development pace, and tools for detection. 
By the end of the presentation, attendees will be equipped with how to create and detect deepfakes, a deep understanding of the technology and its impact.\n\nTalk Outline:\n\nI. How Deepfakes Work (Approx. 8 minutes)\n\nStep-by-step explanation of deepfake creation using an opensource tool\nClarifying the technical aspects behind manipulating existing media with AI algorithms\nII. Deepfakes with GANs (Approx. 8 minutes)\n\nIntroduction to Generative Adversarial Networks (GANs) and their role in deepfake generation\nDifferent types of GANs and how to craft realistic deepfakes\nIII. The Good and the Bad (Approx. 8 minutes)\n\nExploring the positive effects of deepfakes\nUnveiling the negative implications of deepfakes\nReal-world examples highlighting the ethical concerns\nSpeculating on the future developments of deepfake technology\nIV. How to Recognize Deepfakes (Approx. 6 minutes)\n\nInsight into the ongoing efforts to combat the misuse of deepfakes\nVarious approaches and AI-driven tools for detecting deepfake media\nUnderstanding the limitations in detecting increasingly sophisticated deepfakes\nKey Takeaways:\n\nIn-depth understanding of deepfake creation and the role of GANs\nAwareness of the positive and negative impacts of deepfakes in different domains\nReal-world examples illustrating the ethical concerns surrounding deepfakes\nInsights into the future trends and advancements in deepfake technology\nFamiliarity with a range of AI-based approaches and tools for detecting deepfakes\n\nBio:\nMaryam Miradi\nMaryam Miradi is AI and Data Science Lead at Transactie Monitoring Nederland (TMNL). She has a PhD in Artificial Intelligence Deep Learning, specialised in NLP and Computer Vision from Delft University of Technology. The last 15 years, she has developed different AI solutions for Organisations such as Ahold-Delhaize, Belastingdienst, Alliander, Stedin and ABN AMRO\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Deepfakes, a form of synthetic media where a person's image or video is seamlessly replaced using Generative AI like GANs, have received significant attention. This talk aims to provide a comprehensive exploration of deepfakes, covering their creation process, positive and negative effects, development pace, and tools for detection. By the end of the presentation, attendees will be equipped to create and detect deepfakes, with a deep understanding of the technology and its impact.\n\nTalk Outline:\n\nI. How Deepfakes Work (Approx. 
8 minutes)\n\nStep-by-step explanation of deepfake creation using an open-source tool\nClarifying the technical aspects behind manipulating existing media with AI algorithms\nII. Deepfakes with GANs (Approx. 8 minutes)\n\nIntroduction to Generative Adversarial Networks (GANs) and their role in deepfake generation\nDifferent types of GANs and how to craft realistic deepfakes\nIII. The Good and the Bad (Approx. 8 minutes)\n\nExploring the positive effects of deepfakes\nUnveiling the negative implications of deepfakes\nReal-world examples highlighting the ethical concerns\nSpeculating on the future developments of deepfake technology\nIV. How to Recognize Deepfakes (Approx. 6 minutes)\n\nInsight into the ongoing efforts to combat the misuse of deepfakes\nVarious approaches and AI-driven tools for detecting deepfake media\nUnderstanding the limitations in detecting increasingly sophisticated deepfakes\nKey Takeaways:\n\nIn-depth understanding of deepfake creation and the role of GANs\nAwareness of the positive and negative impacts of deepfakes in different domains\nReal-world examples illustrating the ethical concerns surrounding deepfakes\nInsights into the future trends and advancements in deepfake technology\nFamiliarity with a range of AI-based approaches and tools for detecting deepfakes\n\nBio:\nMaryam Miradi\nMaryam Miradi is AI and Data Science Lead at Transactie Monitoring Nederland (TMNL). She has a PhD in Artificial Intelligence Deep Learning, specialised in NLP and Computer Vision from Delft University of Technology. Over the last 15 years, she has developed different AI solutions for Organisations such as Ahold-Delhaize, Belastingdienst, Alliander, Stedin and ABN AMRO.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1677, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json b/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json index 530cb3367..de1d8c881 100644 --- a/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json +++ b/pydata-amsterdam-2023/videos/mastering-recommendation-systems-evaluation-an-a-b-testing-approach-with-insights-from-the-industry.json @@ -1,5 +1,5 @@ { - "description": "Recommendation systems shape personalized experiences across various sectors, but evaluating their effectiveness remains a significant challenge. Drawing on experiences from industry leaders such as Booking.com, this talk introduces a robust, practical approach to A/B testing for assessing the quality of recommendation systems. The talk is designed for data scientists, statisticians, and business professionals, offering real-world insights and industry tricks on setting up A/B tests, interpreting results, and circumventing common pitfalls. While basic familiarity with recommendation systems and A/B testing is beneficial, it's not a prerequisite.\n\nThis talk aims to provide attendees with a practical understanding of A/B testing in the evaluation of recommendation systems, including unique insights from industry practices and specific tricks that enhance effectiveness.\n\nMy report includes next steps:\n- Introduction to recommendation systems, their ubiquity, and the imperative for evaluation, including industry examples.\n- An overview of A/B testing and its vital role in assessing recommendation systems, supported by insights from Booking.com and other industry leaders.\n- Techniques for designing effective hypotheses for A/B tests, focusing on recommendation systems.\n- Choosing pertinent metrics for robust evaluation of recommendation systems with industry examples.\n- Conducting A/B tests: industry best practices, common pitfalls, and strategies for mitigation, reinforced by real-world cases.\n- Accurate interpretation of A/B testing results and management of statistical biases, with insights from the field.\nBy the end of the talk, attendees will have a comprehensive understanding of how to apply A/B testing effectively to recommendation systems, select relevant metrics, interpret results accurately, and navigate common challenges, backed by industry best practices and practical examples.\n\nBio:\nIldar Safilo\nMachine Learning Scientist in the Booking.com\nExperienced manager in MLE/DS/SE/DA, I possess extensive expertise in machine learning, analytics, and software engineering. I excel at leading teams to create groundbreaking businesses and delivering innovative solutions for real-world business cases across various industries, including IT, banking, telecommunications, marketplaces, game development, shops, Travel-tech and streaming platforms.\nExpert in building recommendation and ranking systems, as well as personalization automation with machine learning, and advanced A/B testing.\nCo-author and lecturer of a popular online course on recommender system development with over 1000 students.\nCo-author an open-source Python library called RecTools, specifically designed for building recommender systems. 
The library is hosted on GitHub at RecTools and has received widespread recognition and adoption in the industry.\nGraduate with a Master\u2019s degree in Mathematics and Computer Science and over 6 years of experience in data science.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Recommendation systems shape personalized experiences across various sectors, but evaluating their effectiveness remains a significant challenge. Drawing on experiences from industry leaders such as Booking.com, this talk introduces a robust, practical approach to A/B testing for assessing the quality of recommendation systems. The talk is designed for data scientists, statisticians, and business professionals, offering real-world insights and industry tricks on setting up A/B tests, interpreting results, and circumventing common pitfalls. While basic familiarity with recommendation systems and A/B testing is beneficial, it's not a prerequisite.\n\nThis talk aims to provide attendees with a practical understanding of A/B testing in the evaluation of recommendation systems, including unique insights from industry practices and specific tricks that enhance effectiveness.\n\nThis talk covers the following steps:\n- Introduction to recommendation systems, their ubiquity, and the imperative for evaluation, including industry examples.\n- An overview of A/B testing and its vital role in assessing recommendation systems, supported by insights from Booking.com and other industry leaders.\n- Techniques for designing effective hypotheses for A/B tests, focusing on recommendation systems.\n- Choosing pertinent metrics for robust evaluation of recommendation systems with industry examples.\n- Conducting A/B tests: industry best practices, common pitfalls, and strategies for mitigation, reinforced by real-world cases.\n- Accurate interpretation of A/B testing results and management of statistical biases, with insights from the field.\nBy the end of the talk, attendees will have a comprehensive understanding of how to apply A/B testing effectively to recommendation systems, select relevant metrics, interpret results accurately, and navigate common challenges, backed by industry best practices and practical examples.\n\nBio:\nIldar Safilo\nMachine Learning Scientist at Booking.com\nExperienced manager in MLE/DS/SE/DA, I possess extensive expertise in machine learning, analytics, and software engineering. 
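As a companion to the A/B-testing outline above (toy numbers, not material from the talk): the simplest version of "accurate interpretation of results" is a two-proportion significance test on conversion rates, for example with statsmodels:

~~~python
# Minimal A/B significance sketch for a recommender experiment:
# compare conversion counts of control vs. variant. Toy numbers;
# real tests also need power analysis and guardrail metrics.
from statsmodels.stats.proportion import proportions_ztest

conversions = [420, 480]      # control, new recommender
exposures = [10_000, 10_000]  # users in each arm

stat, p_value = proportions_ztest(conversions, exposures)
print(f"z = {stat:.2f}, p = {p_value:.4f}")
~~~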
I excel at leading teams to create groundbreaking businesses and deliver innovative solutions for real-world business cases across various industries, including IT, banking, telecommunications, marketplaces, game development, shops, Travel-tech and streaming platforms.\nExpert in building recommendation and ranking systems, as well as personalization automation with machine learning, and advanced A/B testing.\nCo-author and lecturer of a popular online course on recommender system development with over 1000 students.\nCo-author of an open-source Python library called RecTools, specifically designed for building recommender systems. The library is hosted on GitHub at RecTools and has received widespread recognition and adoption in the industry.\nGraduate with a Master\u2019s degree in Mathematics and Computer Science and over 6 years of experience in data science.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1090, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json index fc150d57f..f25eed6eb 100644 --- a/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/muhleisen-raasveldt-in-process-analytical-data-management-with-duckdb-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "DuckDB is a novel analytical data management system. DuckDB supports complex queries, has no external dependencies, and is deeply integrated into the Python ecosystem. Because DuckDB runs in the same process, no serialization or socket communication has to occur, making data transfer virtually instantaneous. For example, DuckDB can directly query Pandas data frames faster than Pandas itself. In our talk, we will describe the user values of DuckDB, and how it can be used to improve their day-to-day lives through automatic parallelization, efficient operators and out-of-core operations.\n\nData management systems and data analysts have a troubled relationship: Common systems such as Postgres or Spark are unwieldy, hard to set up and maintain, hard to transfer data in and out, and hard to integrate into complex end-to-end workflows. As a response, analysts have developed their own ecosystem of data wrangling tools such as Pandas or Polars. 
These tools are much more natural for analysts to use, but are limited in the amount of data they can process or the amount of automatic optimization that is supported.\n\nDuckDB is a new analytical data management system that is built for an in-process use case. DuckDB speaks SQL, has no external dependencies, and is deeply integrated into the Python ecosystem. DuckDB is Free and Open Source software under the MIT license. DuckDB uses state-of-the art query processing techniques with vectorized execution, lightweight compression, and morsel-driven automatic parallelism. DuckDB is out-of-core capable, meaning that it is capable of not only reading datasets that are bigger than main memory. This allows for analysis of far greater datasets and in many cases removes the need to run separate infrastructure.\n\nThe \u201cduckdb\u201d Python package is not a client to the DuckDB system, it provides the entire database engine. DuckDB runs without any external server directly inside the Python process. Once there, DuckDB can run complex SQL queries on data frames in Pandas, Polars or PyArrow formats out-of-the box. DuckDB can also directly ingest files in Parquet, CSV or JSON formats. Because DuckDB runs in the same process, data transfer are virtually instantaneous. Conversely, DuckDB\u2019s query results can be transferred back into data frames very cheaply, allowing direct integration with complex downstream libraries such as PyTorch or TensorFlow.\n\nDuckDB enjoys fast-growing popularity, the Python package alone is currently downloaded around one million times a month. DuckDB has recently become the default backend of the Ibis project that offers a consistent interface in Python over a variety of data backends.\n\nThis talk is aimed at two main groups, data analysts and data engineers. For the analysts, we will explain the user values of DuckDB, and how it can be used to improve their day-to-day lives. For data engineers, we will describe DuckDB\u2019s capabilities to become part of large automated data pipelines. The presenters for the proposed talk, Hannes M\u00fchleisen and Mark Raasveldt are the original creators of DuckDB, they are still leading the project and are deeply familiar with its Python integration.\n\nBios:\nHannes M\u00fchleisen\nProf. Dr. Hannes M\u00fchleisen is a creator of the DuckDB database management system and Co-founder and CEO of DuckDB Labs, a consulting company providing services around DuckDB. He is also a senior researcher of the Database Architectures group at the Centrum Wiskunde & Informatica (CWI), the Dutch national research lab for Mathematics and Computer Science in Amsterdam. Hannes is also Professor of Data Engineering at Radboud Universiteit Nijmegen. His' main interest is analytical data management systems.\n\nMark Raasveldt\nCTO at DuckDB Labs\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "DuckDB is a novel analytical data management system. DuckDB supports complex queries, has no external dependencies, and is deeply integrated into the Python ecosystem. Because DuckDB runs in the same process, no serialization or socket communication has to occur, making data transfer virtually instantaneous. For example, DuckDB can directly query Pandas data frames faster than Pandas itself. In our talk, we will describe the user values of DuckDB, and how it can be used to improve their day-to-day lives through automatic parallelization, efficient operators and out-of-core operations.\n\nData management systems and data analysts have a troubled relationship: Common systems such as Postgres or Spark are unwieldy, hard to set up and maintain, hard to transfer data in and out, and hard to integrate into complex end-to-end workflows. As a response, analysts have developed their own ecosystem of data wrangling tools such as Pandas or Polars. These tools are much more natural for analysts to use, but are limited in the amount of data they can process or the amount of automatic optimization that is supported.\n\nDuckDB is a new analytical data management system that is built for an in-process use case. DuckDB speaks SQL, has no external dependencies, and is deeply integrated into the Python ecosystem. DuckDB is Free and Open Source software under the MIT license. DuckDB uses state-of-the-art query processing techniques with vectorized execution, lightweight compression, and morsel-driven automatic parallelism. DuckDB is out-of-core capable, meaning that it can process datasets that are bigger than main memory. This allows for analysis of far greater datasets and in many cases removes the need to run separate infrastructure.\n\nThe \u201cduckdb\u201d Python package is not a client to the DuckDB system; it provides the entire database engine. DuckDB runs without any external server directly inside the Python process. Once there, DuckDB can run complex SQL queries on data frames in Pandas, Polars or PyArrow formats out of the box. DuckDB can also directly ingest files in Parquet, CSV or JSON formats. Because DuckDB runs in the same process, data transfer is virtually instantaneous. Conversely, DuckDB\u2019s query results can be transferred back into data frames very cheaply, allowing direct integration with complex downstream libraries such as PyTorch or TensorFlow.\n\nDuckDB enjoys fast-growing popularity; the Python package alone is currently downloaded around one million times a month. DuckDB has recently become the default backend of the Ibis project that offers a consistent interface in Python over a variety of data backends.\n\nThis talk is aimed at two main groups, data analysts and data engineers. For the analysts, we will explain the user values of DuckDB, and how it can be used to improve their day-to-day lives. For data engineers, we will describe DuckDB\u2019s capabilities to become part of large automated data pipelines. 
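The in-process workflow this abstract describes fits in a few lines; this sketch assumes a recent duckdb release:

~~~python
# Sketch: DuckDB querying a Pandas data frame in-process. The frame
# is picked up by name via DuckDB's replacement scans; results come
# back as a Pandas data frame with no server round-trip.
import duckdb
import pandas as pd

df = pd.DataFrame({"city": ["Amsterdam", "Utrecht", "Amsterdam"],
                   "amount": [10.0, 5.0, 7.5]})

result = duckdb.sql("""
    SELECT city, SUM(amount) AS total
    FROM df
    GROUP BY city
    ORDER BY total DESC
""").df()
print(result)
~~~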
The presenters for the proposed talk, Hannes M\u00fchleisen and Mark Raasveldt, are the original creators of DuckDB; they are still leading the project and are deeply familiar with its Python integration.\n\nBios:\nHannes M\u00fchleisen\nProf. Dr. Hannes M\u00fchleisen is a creator of the DuckDB database management system and Co-founder and CEO of DuckDB Labs, a consulting company providing services around DuckDB. He is also a senior researcher of the Database Architectures group at the Centrum Wiskunde & Informatica (CWI), the Dutch national research lab for Mathematics and Computer Science in Amsterdam. Hannes is also Professor of Data Engineering at Radboud Universiteit Nijmegen. His main interest is analytical data management systems.\n\nMark Raasveldt\nCTO at DuckDB Labs\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1392, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json index 9002c44c5..85ae0d46a 100644 --- a/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/nagelkerke-smeets-revealing-the-true-motives-of-news-readers-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Every news consumer has needs and in order to build a true bond with your customer it is vital to meet these, sometimes, diverse needs. To achieve this, first of all, it is important to identify the overarching needs of users; the reason why they read news. The BBC conducted research to determine these needs and identified six distinct categories: Update me, Keep me on trend, Give me perspective, Educate me, Divert me, and Inspire me. Their research showed that an equal distribution of content across these user needs will lead to higher customer engagement and loyalty. To apply this concept within DPG Media, we started building our own user needs model. Through various iterations of text labelling, text preparation, model building, fine-tuning and evaluation, we have arrived at a BERT model that is capable of determining the associated user needs based solely on the article text.\n\nWe would like to take the audience through all the steps that we have taken to get to the point where we are right now. 
During this process we had to find solutions to many obstacles and we are happy to share these lessons with the audience. Furthermore, we want to discuss all the tools and techniques that we used in order to arrive at the current phase.\n\nThe focus of the talk is on preparing the datasets and building the models, so a background in data science, engineering and/or machine learning is usefull.\n\nThe time breakdown will be the following:\nMinutes 0-5: introducing the topic and explaining why it is important\nMinutes 5-10: discussing the tools that we used and prior decisions we made\nMinutes 10-20: going through the labelling process and different models we build\nMinutes 20-25: sharing results and lessons learnt\nMinutes 25-30: giving insights into next steps and future applications\n\nBios:\nJurriaan Nagelkerke\nData Scientist with 15+ years experience in getting value out of data for various companies in different branches. Love to apply the right ML/ AI techniques to answer business questions and actually make a difference. Aside from hands on consultant i'm also trainer in various ML techniques. Last few years strong focus on textual data / NLP and transformer models / LLMs.\n\nVincent Smeets\nHi, my name is Vincent Smeets. I am one of the data scientists within the Data And Customer Analytics department at DPG Media. I am responsible for generating insights from structured and semi-structured data to support decision making within the B2C Marketing organisation. In my freetime I love skateboarding, tennis and running.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Every news consumer has needs and in order to build a true bond with your customer it is vital to meet these, sometimes, diverse needs. To achieve this, first of all, it is important to identify the overarching needs of users; the reason why they read news. The BBC conducted research to determine these needs and identified six distinct categories: Update me, Keep me on trend, Give me perspective, Educate me, Divert me, and Inspire me. Their research showed that an equal distribution of content across these user needs will lead to higher customer engagement and loyalty. To apply this concept within DPG Media, we started building our own user needs model. 
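For a feel of what scoring an article against the six user needs looks like in code, here is a zero-shot stand-in built on Hugging Face transformers — explicitly not DPG Media's model, which (as the description explains) is a BERT classifier fine-tuned on labelled articles:

~~~python
# Zero-shot stand-in for the fine-tuned user-needs classifier.
# "facebook/bart-large-mnli" is a public NLI model, not DPG Media's.
from transformers import pipeline

LABELS = ["Update me", "Keep me on trend", "Give me perspective",
          "Educate me", "Divert me", "Inspire me"]

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
article = "A deep dive into how rising interest rates reshape housing."
result = classifier(article, candidate_labels=LABELS)
print(result["labels"][0], round(result["scores"][0], 2))
~~~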
Through various iterations of text labelling, text preparation, model building, fine-tuning and evaluation, we have arrived at a BERT model that is capable of determining the associated user needs based solely on the article text.\n\nWe would like to take the audience through all the steps that we have taken to get to the point where we are right now. During this process we had to find solutions to many obstacles and we are happy to share these lessons with the audience. Furthermore, we want to discuss all the tools and techniques that we used in order to arrive at the current phase.\n\nThe focus of the talk is on preparing the datasets and building the models, so a background in data science, engineering and/or machine learning is useful.\n\nThe time breakdown will be the following:\nMinutes 0-5: introducing the topic and explaining why it is important\nMinutes 5-10: discussing the tools that we used and prior decisions we made\nMinutes 10-20: going through the labelling process and different models we built\nMinutes 20-25: sharing results and lessons learnt\nMinutes 25-30: giving insights into next steps and future applications\n\nBios:\nJurriaan Nagelkerke\nData Scientist with 15+ years experience in getting value out of data for various companies in different branches. Love to apply the right ML/ AI techniques to answer business questions and actually make a difference. Aside from hands-on consulting, I'm also a trainer in various ML techniques. Last few years strong focus on textual data / NLP and transformer models / LLMs.\n\nVincent Smeets\nHi, my name is Vincent Smeets. I am one of the data scientists within the Data And Customer Analytics department at DPG Media. I am responsible for generating insights from structured and semi-structured data to support decision making within the B2C Marketing organisation. In my free time I love skateboarding, tennis and running.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1335, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json b/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json index 8f2ada600..426457c3c 100644 --- a/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/okke-van-der-wal-personalization-at-uber-scale-via-causal-driven-machine-learning-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "In this talk, we outline how we introduced causality into our machine learning models within the core checkout and onboarding experiences globally, thereby strongly improving our key business metrics. We discuss case studies, where experimental data were combined with machine learning in order to create value for our users and personalize their experiences, and we share our lessons learned with the goal to inspire attendees to start incorporating causality into their machine learning solutions. Additionally, we explain how the open source Python package developed at Uber, CausalML, can help others in successfully making the transition from correlation-driven machine learning to causal-driven machine learning.\n\nIn this talk, we outline how we introduced causality into our machine learning models within the core checkout and onboarding experiences globally, thereby strongly improving our key business metrics. We discuss case studies, where experimental data were combined with machine learning in order to create value for our users and personalize their experiences, and we share our lessons learned with the goal to inspire attendees to start incorporating causality into their machine learning solutions. Additionally, we explain how the open source Python package developed at Uber, CausalML, can help others in successfully making the transition from correlation-driven machine learning to causal-driven machine learning.\n\nBio:\nOkke van der Wal\nLeading the Payments Machine Learning team at Uber working on Anomaly Detection, Personalization & Fraud Detection within the Onboarding and Checkout experiences at Uber using Contextual Bandits, Uplift Modelling & Reinforcement Learning.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In this talk, we outline how we introduced causality into our machine learning models within the core checkout and onboarding experiences globally, thereby strongly improving our key business metrics. We discuss case studies, where experimental data were combined with machine learning in order to create value for our users and personalize their experiences, and we share our lessons learned with the goal to inspire attendees to start incorporating causality into their machine learning solutions. Additionally, we explain how the open source Python package developed at Uber, CausalML, can help others in successfully making the transition from correlation-driven machine learning to causal-driven machine learning.\n\nIn this talk, we outline how we introduced causality into our machine learning models within the core checkout and onboarding experiences globally, thereby strongly improving our key business metrics. We discuss case studies, where experimental data were combined with machine learning in order to create value for our users and personalize their experiences, and we share our lessons learned with the goal to inspire attendees to start incorporating causality into their machine learning solutions. Additionally, we explain how the open source Python package developed at Uber, CausalML, can help others in successfully making the transition from correlation-driven machine learning to causal-driven machine learning.\n\nBio:\nOkke van der Wal\nLeading the Payments Machine Learning team at Uber working on Anomaly Detection, Personalization & Fraud Detection within the Onboarding and Checkout experiences at Uber using Contextual Bandits, Uplift Modelling & Reinforcement Learning.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1316, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json index 34d232b39..4cce649de 100644 --- a/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/reliable-and-scalable-ml-serving-best-practices-for-online-model-deployment-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Working on ML serving for couple of years we learned a lot. I would like to share a set of best practices / learnings with the community\n\nAt Adyen we deploy a lot of models for online inference in the payment flow. Working in the MLOps team to streamline this process, I learned a lot about best practices / things to consider before (after) putting a model online. These are small things but they do contribute to a production and reliable setup for online inference. Some examples:\n\nAdding meta data & creating a self contained archive\nSeparating serving sources from training sources\nChoosing the requirements of model\nAdding an example input & output request\nAdding schemas for input and output\nCommon issues when putting models online: memory leaks, concurrency\nWhich server is best? Process based or thread based\nHow different python versions affect inference (execution) time\n\nBio:\nZiad Al Moubayed\nStaff Engineer @ Adyen. I am passionate about high performance distributed systems. Recently I was working on scaling Adyen's Data & ML infrastructure.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Working on ML serving for couple of years we learned a lot. I would like to share a set of best practices / learnings with the community\n\nAt Adyen we deploy a lot of models for online inference in the payment flow. Working in the MLOps team to streamline this process, I learned a lot about best practices / things to consider before (after) putting a model online. These are small things but they do contribute to a production and reliable setup for online inference. 
Some examples:\n\nAdding meta data & creating a self contained archive\nSeparating serving sources from training sources\nChoosing the requirements of model\nAdding an example input & output request\nAdding schemas for input and output\nCommon issues when putting models online: memory leaks, concurrency\nWhich server is best? Process based or thread based\nHow different python versions affect inference (execution) time\n\nBio:\nZiad Al Moubayed\nStaff Engineer @ Adyen. I am passionate about high performance distributed systems. Recently I was working on scaling Adyen's Data & ML infrastructure.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1436, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json index 7a3fb3d84..0528e6abd 100644 --- a/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/riccardo-amadio-declarative-data-manipulation-pipeline-with-dagster-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Bored of old pipeline orchestrator? Difficult to understand if data is up-to-date? Trouble with development workflow of data pipeline?\nDagster, an open-source tool, offers a unique paradigm that simplifies the orchestration and management of data pipelines.\nBy adopting declarative principles, data engineers and data scientists can build scalable, maintainable, and reliable pipelines effortlessly.\nWe will commence with an introduction to Dagster, covering its fundamental concepts to ensure a comprehensive understanding of the material.\nSubsequently, we will explore practical scenarios and use cases, with also DBT for empower the power of SQL language.\n\nMinutes 0-5: Explain the design pattern problem of actual data pipeline framework.\nMinutes 5-15: Introduction to Dagster and its core concepts.\nMinutes 10-25: Practical examples of building declarative data pipelines with Dagster, with also DBT, the power of gRPC server.\nMinutes 25-30: Q&A and conclusion.\n\nAre you tired of struggling with outdated pipeline orchestrators? Do you find it challenging to ensure your data is always up-to-date? 
Are you facing difficulties with the development workflow of your data pipeline?\n\nIn this session, we will introduce Dagster, an open-source tool that revolutionizes the orchestration and management of data pipelines. By embracing declarative principles, data engineers and data scientists can effortlessly build scalable, maintainable, and reliable pipelines.\n\nWe will begin by providing an overview of the design pattern problem that many existing data pipeline frameworks face. Understanding the limitations of these frameworks will set the stage for exploring the transformative capabilities of Dagster\n\nNext, we will delve into the core concepts of Dagster, ensuring a comprehensive understanding of the material. You will learn how Dagster simplifies pipeline development and execution by providing a declarative and intuitive approach. Through practical examples and hands-on demonstrations, we will showcase how you can leverage Dagster to build powerful data pipelines.\n\nBut that's not all! We will also explore the integration of DBT, empowering you to harness the full potential of the SQL language within your data pipelines. You will witness the synergy between Dagster and DBT, unlocking new possibilities for data manipulation and transformation.\n\nBy the end, you'll be equipped with the knowledge and inspiration to elevate your data pipeline workflows to new heights.\n\nOutline:\n\nMinutes 0-5: Understanding the design pattern problem of existing data pipeline frameworks\nMinutes 5-15: Introduction to Dagster and its core concepts\nMinutes 10-25: Practical examples of building declarative data pipelines with Dagster, including the integration with DBT and the power of gRPC server\nMinutes 25-30: Q&A and conclusion\n\nBio:\nRiccardo Amadio\nSenior Data Engineer at Agile Lab with a background of Data Scientist and Software Engineer.\nWhen I don't work with data pipelines , I juggle between closing some of my 100+ open tabs on the browser and my true passion: collecting stars on GitHub \ud83d\udd2d\ud83c\udf1f. In this treasure trove of more than 2,000 repositories, I am pretty sure I can find any tool to solve a problem, and I can\u2019t wait to share them with you.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Bored of old pipeline orchestrator? Difficult to understand if data is up-to-date? 
Trouble with development workflow of data pipeline?\nDagster, an open-source tool, offers a unique paradigm that simplifies the orchestration and management of data pipelines.\nBy adopting declarative principles, data engineers and data scientists can build scalable, maintainable, and reliable pipelines effortlessly.\nWe will commence with an introduction to Dagster, covering its fundamental concepts to ensure a comprehensive understanding of the material.\nSubsequently, we will explore practical scenarios and use cases, together with DBT to unleash the power of the SQL language.\n\nMinutes 0-5: Explain the design pattern problem of current data pipeline frameworks.\nMinutes 5-15: Introduction to Dagster and its core concepts.\nMinutes 10-25: Practical examples of building declarative data pipelines with Dagster, including DBT and the power of the gRPC server.\nMinutes 25-30: Q&A and conclusion.\n\nAre you tired of struggling with outdated pipeline orchestrators? Do you find it challenging to ensure your data is always up-to-date? Are you facing difficulties with the development workflow of your data pipeline?\n\nIn this session, we will introduce Dagster, an open-source tool that revolutionizes the orchestration and management of data pipelines. By embracing declarative principles, data engineers and data scientists can effortlessly build scalable, maintainable, and reliable pipelines.\n\nWe will begin by providing an overview of the design pattern problem that many existing data pipeline frameworks face. Understanding the limitations of these frameworks will set the stage for exploring the transformative capabilities of Dagster.\n\nNext, we will delve into the core concepts of Dagster, ensuring a comprehensive understanding of the material. You will learn how Dagster simplifies pipeline development and execution by providing a declarative and intuitive approach. Through practical examples and hands-on demonstrations, we will showcase how you can leverage Dagster to build powerful data pipelines.\n\nBut that's not all! We will also explore the integration of DBT, empowering you to harness the full potential of the SQL language within your data pipelines. You will witness the synergy between Dagster and DBT, unlocking new possibilities for data manipulation and transformation.\n\nBy the end, you'll be equipped with the knowledge and inspiration to elevate your data pipeline workflows to new heights.\n\nOutline:\n\nMinutes 0-5: Understanding the design pattern problem of existing data pipeline frameworks\nMinutes 5-15: Introduction to Dagster and its core concepts\nMinutes 10-25: Practical examples of building declarative data pipelines with Dagster, including the integration with DBT and the power of the gRPC server\nMinutes 25-30: Q&A and conclusion\n\nBio:\nRiccardo Amadio\nSenior Data Engineer at Agile Lab with a background as a Data Scientist and Software Engineer.\nWhen I don't work with data pipelines, I juggle between closing some of my 100+ open tabs on the browser and my true passion: collecting stars on GitHub \ud83d\udd2d\ud83c\udf1f. In this treasure trove of more than 2,000 repositories, I am pretty sure I can find any tool to solve a problem, and I can\u2019t wait to share them with you.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. 
The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1300, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json b/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json index 37b56ac71..175e63ad2 100644 --- a/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/rik-van-der-vlist-balancing-the-electricity-grid-with-multi-level-forecasting-models-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "Join us as we explore the complexities of balancing the electricity grid amidst the rise of renewable energy sources. We\u2019ll discover the challenges in forecasting electricity consumption from diverse industrial resources and the modelling techniques employed by Sympower to achieve accurate forecasts. Gain insights into the trade-offs involved in aggregating data at different hierarchical levels in time series forecasting.\n\nThe shift to renewable energy sources presents a major challenge for the electricity grid: solar and wind facilities are constantly varying in power output, making it harder to keep the supply and demand in balance. This creates a need for demand response: strategic activation or deactivation of large industrial resources to balance the electricity grid. Reliable demand response requires an accurate forecast of industrial electricity consumption, to get a clear understanding of which resources can be controlled at what time.\n\nIn this talk we will discuss the challenges faced when forecasting electricity consumption from industrial resources from different kinds of industries such as furnaces, greenhouses or paper mills. We\u2019ll discuss the different modelling approaches for predicting time series including regression, forecasting and deep learning, and we will discuss the suitability of each in different scenarios. Using the forecasting of electricity consumption of industrial resources as an example, we show how we make our forecasts at Sympower to help balance the electricity grid.\n\nFinally we will discuss a trade-off in forecasting: Trends and seasonality often only emerge at aggregate levels, making forecasting at the aggregate level easier. On the other hand, business often requires precision-level insights. Aggregate data is inherently less noisy since the errors tend to cancel out, but also might fail to capture lower-level details. 
We will discuss the considerations to make when forecasting at different aggregated levels in time or across groups, and what you could do to forecast consistently across different aggregate levels..\n\nKEY TAKEAWAYS\n- Gain insights into selecting the most suitable modelling technique for your forecasting need\n- Understand the challenges posed by the evolving electricity grid and the significance of demand response\n- Explore the trade-offs involved in aggregating data at different hierarchical or temporal levels in time series forecasting\n\nBio:\nRik van der Vlist\nRik is a machine learning engineer with a strong foundation in electrical engineering and a specialization in leveraging electricity data for smart use cases. With previous experience at Eneco, he has focused on delivering automated home energy insights to large group of customers. Currently, Rik is dedicated to constructing a scalable forecasting model for a sustainable electricity grid, combining his passion for data science and sustainable solutions. He thrives on creating value and generating insights from raw data, demonstrating his proficiency in building robust and scalable data pipelines using Spark and Python.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Join us as we explore the complexities of balancing the electricity grid amidst the rise of renewable energy sources. We\u2019ll discover the challenges in forecasting electricity consumption from diverse industrial resources and the modelling techniques employed by Sympower to achieve accurate forecasts. Gain insights into the trade-offs involved in aggregating data at different hierarchical levels in time series forecasting.\n\nThe shift to renewable energy sources presents a major challenge for the electricity grid: solar and wind facilities are constantly varying in power output, making it harder to keep supply and demand in balance. This creates a need for demand response: strategic activation or deactivation of large industrial resources to balance the electricity grid. Reliable demand response requires an accurate forecast of industrial electricity consumption, to get a clear understanding of which resources can be controlled at what time.\n\nIn this talk, we will discuss the challenges faced when forecasting electricity consumption from industrial resources across different kinds of industries such as furnaces, greenhouses or paper mills. 
We\u2019ll discuss the different modelling approaches for predicting time series including regression, forecasting and deep learning, and we will discuss the suitability of each in different scenarios. Using the forecasting of electricity consumption of industrial resources as an example, we show how we make our forecasts at Sympower to help balance the electricity grid.\n\nFinally, we will discuss a trade-off in forecasting: Trends and seasonality often only emerge at aggregate levels, making forecasting at the aggregate level easier. On the other hand, business often requires precision-level insights. Aggregate data is inherently less noisy since the errors tend to cancel out, but also might fail to capture lower-level details. We will discuss the considerations to make when forecasting at different aggregated levels in time or across groups, and what you could do to forecast consistently across different aggregate levels.\n\nKEY TAKEAWAYS\n- Gain insights into selecting the most suitable modelling technique for your forecasting needs\n- Understand the challenges posed by the evolving electricity grid and the significance of demand response\n- Explore the trade-offs involved in aggregating data at different hierarchical or temporal levels in time series forecasting\n\nBio:\nRik van der Vlist\nRik is a machine learning engineer with a strong foundation in electrical engineering and a specialization in leveraging electricity data for smart use cases. With previous experience at Eneco, he has focused on delivering automated home energy insights to a large group of customers. Currently, Rik is dedicated to constructing a scalable forecasting model for a sustainable electricity grid, combining his passion for data science and sustainable solutions. He thrives on creating value and generating insights from raw data, demonstrating his proficiency in building robust and scalable data pipelines using Spark and Python.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1472, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json b/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json index 533ec3f56..2c3b727d5 100644 --- a/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json +++ b/pydata-amsterdam-2023/videos/rikmanspoel-import-full-focus-as-ff-how-to-reduce-stress-and-pressure-as-a-data-specialist.json @@ -1,5 +1,5 @@ { - "description": "Data science, IT and software development become more and more complex and are subject to increasing requirements and fast-paced business demand. Higher complexity, higher pace and higher quality requirements result in more pressure on our fellow data engineers and data scientists.\n\nMore pressure, but are we resilient enough to withstand that increasing pressure? You have probably already seen its outcome. Unhappiness, stress or even burn-outs of co-workers, instead of creating cool code, great solutions and building a better world using your skills.\n\nHow to change the pressure and stress you perceive as a data scientist, data engineer of ML-engineer? How to ensure that your brain\u2019s frontal lobe returns to a problem solving and decision-making state?\n\nTarget audience\nAll experience levels data engineers, data scientists and analysts. For those who start hitting do\u2019s, don\u2019ts and other hard walls in real life companies and projects. Especially if you experience a drain of energy and focus from those pressure and constrains. Senior or junior, there is much to learn and experience.\n\nTakeaway\nLearn and experience 3 great tools to change your resilience instantly and consistently towards pressure and stress. Not just for yourself, but also be able to see and assist co-workers, family, or other loved ones if they experience stress.\n\nBackground knowledge needed\nNone. Just be sure to bring both your head and body to this workshop to experience how quickly these tools work for you.\n\nTime\n\u2022 0 \u2013 5 Intro and experience tool #1\n\u2022 5 \u2013 15 Control your nervous system and work-related stress\n\u2022 15 \u2013 20 experience tool #2\n\u2022 20 \u2013 25 Your stress and social states (based on polyvagal theory)\n\u2022 25 \u2013 30 experience tool #3\n\nBio:\nMaarten Oude Rikmanspoel\nI love working with both technology and people. Currently working as a freelance data engineer and business intelligence specialist to satisfy the tech part of my heart. Fell in love with Python and the PyData modules in 2017 after unsuccessful relationships with Java and C++ in the past. Applying this in a variety of industries and companies.\n\nIn parallel, I\u2019m creating CalmCode.nl for the past 1,5 years with the aim of guiding software developers, IT- and data specialists towards less stress and burnouts. I\u2019ve seen to many bad examples in the larger companies and multi-nationals where developers almost looked as being oppressed instead of being able to do their work properly and in a nice environment. 
So the people-oriented part of my heart get\u2019s fuelled when I see people grow and being able to take control of their lives again.\n\n\u201cWe\u2019re all just walking each other home.\u201d Ram Dass\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Data science, IT and software development become more and more complex and are subject to increasing requirements and fast-paced business demand. Higher complexity, higher pace and higher quality requirements result in more pressure on our fellow data engineers and data scientists.\n\nMore pressure, but are we resilient enough to withstand that increasing pressure? You have probably already seen its outcome. Unhappiness, stress or even burn-outs of co-workers, instead of creating cool code, great solutions and building a better world using your skills.\n\nHow to change the pressure and stress you perceive as a data scientist, data engineer or ML engineer? How to ensure that your brain\u2019s frontal lobe returns to a problem-solving and decision-making state?\n\nTarget audience\nData engineers, data scientists and analysts of all experience levels. For those who start hitting do\u2019s, don\u2019ts and other hard walls in real-life companies and projects. Especially if you experience a drain of energy and focus from that pressure and those constraints. Senior or junior, there is much to learn and experience.\n\nTakeaway\nLearn and experience 3 great tools to instantly and consistently strengthen your resilience to pressure and stress. Not just for yourself, but also be able to see and assist co-workers, family, or other loved ones if they experience stress.\n\nBackground knowledge needed\nNone. Just be sure to bring both your head and body to this workshop to experience how quickly these tools work for you.\n\nTime\n\u2022 0 \u2013 5 Intro and experience tool #1\n\u2022 5 \u2013 15 Control your nervous system and work-related stress\n\u2022 15 \u2013 20 experience tool #2\n\u2022 20 \u2013 25 Your stress and social states (based on polyvagal theory)\n\u2022 25 \u2013 30 experience tool #3\n\nBio:\nMaarten Oude Rikmanspoel\nI love working with both technology and people. Currently working as a freelance data engineer and business intelligence specialist to satisfy the tech part of my heart. Fell in love with Python and the PyData modules in 2017 after unsuccessful relationships with Java and C++ in the past. 
Applying this in a variety of industries and companies.\n\nIn parallel, I\u2019ve been creating CalmCode.nl for the past 1.5 years with the aim of guiding software developers, IT- and data specialists towards less stress and burnouts. I\u2019ve seen too many bad examples in the larger companies and multi-nationals where developers almost looked oppressed instead of being able to do their work properly and in a nice environment. So the people-oriented part of my heart gets fuelled when I see people grow and become able to take control of their lives again.\n\n\u201cWe\u2019re all just walking each other home.\u201d Ram Dass\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1716, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json index 459edd7d8..7792a6cf0 100644 --- a/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/ritchie-vink-polars-and-a-peek-into-the-expression-engine-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "This talk we will see why the expression engine in polars is so versatile and fast.\nWe will look at them in the perspective of the optimizer as well as the physical engine.\n\nPolars expressions are a DSL to a very powerful vectorized engine. They make it very easy to write parallel, efficient and readable code.\n\nThis talk we will see why the expression engine in polars is so versatile and fast.\nWe will look at them in the perspective of the optimizer as well as the physical engine.\n\nBio:\nRitchie Vink\nRitchie Vink is the author of the Polars query engine/ DataFrame library and the CEO/Co-Founder of Polars the company.\nOriginally he has a background in Civil Engineering, but he switched fields and has most work experience in Machine learning and software development. Though what truly matters in experience is what he did in his side-projects.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. 
The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In this talk we will see why the expression engine in Polars is so versatile and fast.\nWe will look at expressions from the perspective of the optimizer as well as the physical engine.\n\nPolars expressions are a DSL for a very powerful vectorized engine. They make it very easy to write parallel, efficient and readable code.\n\nIn this talk we will see why the expression engine in Polars is so versatile and fast.\nWe will look at expressions from the perspective of the optimizer as well as the physical engine.\n\nBio:\nRitchie Vink\nRitchie Vink is the author of the Polars query engine / DataFrame library and the CEO/Co-Founder of Polars the company.\nOriginally he has a background in Civil Engineering, but he switched fields and has most of his work experience in machine learning and software development. Though what truly matters in experience is what he did in his side projects.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1468, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json b/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json index 52d682e9f..c5561badd 100644 --- a/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json +++ b/pydata-amsterdam-2023/videos/robert-erdmann-keynote-python-for-imaging-and-artificial-intelligence-in-cultural-heritage.json @@ -1,5 +1,5 @@ { - "description": "For many people, a museum is the last place they would expect to find cutting-edge data science, but the world of cultural heritage is full of fascinating challenges for imaging and computation. 
The availability of high-resolution imaging, high-speed internet, and modern computational tools allows us to image cultural heritage objects in staggering detail and with a wide array of techniques. The result, though, is a data deluge: studying single objects like Rembrandt's Night Watch can generate terabytes of data, and there are millions of objects in the world's museums. \n\nThe huge Python ecosystem enables us to build tools to process, analyze, and visualize these data. Examples include creating the 717 gigapixel (!) image of the Night Watch and reconstructing the painting's long-lost missing pieces using AI; controlling a camera and automated turntable in Jupyter for 3D object photography; revealing hidden watermarks in works on paper using a hybrid physics and deep learning-based ink-removal model; using chemical imaging and convolutional neural networks to see the hidden structure of Rembrandt and Vermeer paintings; and using a webcam or smartphone camera to do real-time similarity search over a database of 2.3 million open-access cultural heritage images at 4 frames per second.\n\nThese and several other live demonstrations show how Python is essential in our work to help the world access, preserve, and understand its cultural heritage.\n\nFor many people, a museum is the last place they would expect to find cutting-edge data science, but the world of cultural heritage is full of fascinating challenges for imaging and computation. The availability of high-resolution imaging, high-speed internet, and modern computational tools allows us to image cultural heritage objects in staggering detail and with a wide array of techniques. The result, though, is a data deluge: studying single objects like Rembrandt's Night Watch can generate terabytes of data, and there are millions of objects in the world's museums. \n\nThe huge Python ecosystem enables us to build tools to process, analyze, and visualize these data. Examples include creating the 717 gigapixel (!) image of the Night Watch and reconstructing the painting's long-lost missing pieces using AI; controlling a camera and automated turntable in Jupyter for 3D object photography; revealing hidden watermarks in works on paper using a hybrid physics and deep learning-based ink-removal model; using chemical imaging and convolutional neural networks to see the hidden structure of Rembrandt and Vermeer paintings; and using a webcam or smartphone camera to do real-time similarity search over a database of 2.3 million open-access cultural heritage images at 4 frames per second.\n\nThese and several other live demonstrations show how Python is essential in our work to help the world access, preserve, and understand its cultural heritage.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "For many people, a museum is the last place they would expect to find cutting-edge data science, but the world of cultural heritage is full of fascinating challenges for imaging and computation. The availability of high-resolution imaging, high-speed internet, and modern computational tools allows us to image cultural heritage objects in staggering detail and with a wide array of techniques. The result, though, is a data deluge: studying single objects like Rembrandt's Night Watch can generate terabytes of data, and there are millions of objects in the world's museums. \n\nThe huge Python ecosystem enables us to build tools to process, analyze, and visualize these data. Examples include creating the 717 gigapixel (!) image of the Night Watch and reconstructing the painting's long-lost missing pieces using AI; controlling a camera and automated turntable in Jupyter for 3D object photography; revealing hidden watermarks in works on paper using a hybrid physics and deep learning-based ink-removal model; using chemical imaging and convolutional neural networks to see the hidden structure of Rembrandt and Vermeer paintings; and using a webcam or smartphone camera to do real-time similarity search over a database of 2.3 million open-access cultural heritage images at 4 frames per second.\n\nThese and several other live demonstrations show how Python is essential in our work to help the world access, preserve, and understand its cultural heritage.\n\nFor many people, a museum is the last place they would expect to find cutting-edge data science, but the world of cultural heritage is full of fascinating challenges for imaging and computation. The availability of high-resolution imaging, high-speed internet, and modern computational tools allows us to image cultural heritage objects in staggering detail and with a wide array of techniques. The result, though, is a data deluge: studying single objects like Rembrandt's Night Watch can generate terabytes of data, and there are millions of objects in the world's museums. \n\nThe huge Python ecosystem enables us to build tools to process, analyze, and visualize these data. Examples include creating the 717 gigapixel (!) image of the Night Watch and reconstructing the painting's long-lost missing pieces using AI; controlling a camera and automated turntable in Jupyter for 3D object photography; revealing hidden watermarks in works on paper using a hybrid physics and deep learning-based ink-removal model; using chemical imaging and convolutional neural networks to see the hidden structure of Rembrandt and Vermeer paintings; and using a webcam or smartphone camera to do real-time similarity search over a database of 2.3 million open-access cultural heritage images at 4 frames per second.\n\nThese and several other live demonstrations show how Python is essential in our work to help the world access, preserve, and understand its cultural heritage.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. 
PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 2262, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json b/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json index f19738b67..c5873dffd 100644 --- a/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/sleijster-achieving-developer-autonomy-on-on-premise-data-clusters-using-kubernetes-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "Maintaining on-premise clusters poses quite a few challenges. One of these challenges is achieving developer autonomy, where developers can deploy applications themselves. This talk will cover how we set up Kubernetes to achieve exactly that.\n\nAs your datasets are growing, and you gain more use-cases, so do the number of required tools and applications. Where in the past a data cluster consisted of just HDFS, Spark, Airflow and Postgres, you now need OLAP databases, distributed query engines, parallel-computing for your model training and much more. All of this puts a lot of pressure on the infrastructure team responsible to install & maintain all the tools on your platform. By introducing Kubernetes, we change that responsibility to just maintaining HDFS and Kubernetes, and move the responsibility of maintaining and introducing the data tools to the data (platform) engineers.\n\nIn this talk we will cover how we achieved developer autonomy by touching the following subjects;\n- What is the first step to installing Kubernetes on premise?\n- How do we deploy changes automatically?\n- How do we make an experimentation friendly environment for developers while remaining secure?\n- How do we handle secrets to connect different applications together? A\n- Finally, some lessons learned from the migration process.\n\nBio:\nJorrick Sleijster\nJorrick is a Data Platform Engineer at Adyen. With a background in computer science his focus has been on introducing and maintaining tools on the data platform. On the side Jorrick is an active open-source contributor to pet projects and Apache Airflow. One of the contributions was awarded with PR-of-the-month of the Apache Airflow project.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. 
PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Maintaining on-premise clusters poses quite a few challenges. One of these challenges is achieving developer autonomy, where developers can deploy applications themselves. This talk will cover how we set up Kubernetes to achieve exactly that.\n\nAs your datasets are growing, and you gain more use-cases, so does the number of required tools and applications. Where in the past a data cluster consisted of just HDFS, Spark, Airflow and Postgres, you now need OLAP databases, distributed query engines, parallel-computing for your model training and much more. All of this puts a lot of pressure on the infrastructure team responsible for installing & maintaining all the tools on your platform. By introducing Kubernetes, we change that responsibility to just maintaining HDFS and Kubernetes, and move the responsibility of maintaining and introducing the data tools to the data (platform) engineers.\n\nIn this talk we will cover how we achieved developer autonomy by touching on the following subjects:\n- What is the first step to installing Kubernetes on-premise?\n- How do we deploy changes automatically?\n- How do we make an experimentation-friendly environment for developers while remaining secure?\n- How do we handle secrets to connect different applications together?\n- Finally, some lessons learned from the migration process.\n\nBio:\nJorrick Sleijster\nJorrick is a Data Platform Engineer at Adyen. With a background in computer science, his focus has been on introducing and maintaining tools on the data platform. On the side Jorrick is an active open-source contributor to pet projects and Apache Airflow. One of the contributions was awarded PR-of-the-month of the Apache Airflow project.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! 
See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1527, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json b/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json index 951a6b526..5af9bf2b7 100644 --- a/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json +++ b/pydata-amsterdam-2023/videos/staggered-difference-in-differences-in-practice-causal-insights-from-the-music-industry-pdams-23.json @@ -1,5 +1,5 @@ { - "description": "The Difference-in-Differences (DiD) methodology is a popular causal inference method utilized by leading tech firms such as Microsoft Research, LinkedIn, Meta, and Uber. Yet recent studies suggest that traditional DiD methods may have significant limitations when treatment timings differ. An effective alternative is the implementation of the staggered DiD design. We exemplify this by investigating an interesting question in the music industry: Does featuring a song in TV shows influence its popularity, and are there specific factors that could moderate this impact?\n\nDifference-in-differences (DiD) is a causal inference method frequently used in empirical research in industry and academia. However, standard DiD has limitations when interventions occur at different times or affect varying groups. This talk will highlight the application of the Staggered DiD method, a more nuanced approach that addresses these limitations, in the context of the music industry. We will try to answer the question of how music features in TV shows affect music popularity and how this effect might change for different types of music using the staggered DiD method. Attendees will gain an understanding of causal inference through observational studies and specifically how the new DiD methods are used through an interesting and original case study.\n\nThe talk will be structured as follows:\n\nIntro to the case (e.g., background on music features on TV, dataset)\nExplanation of the DiD approach and its limitations.\nIntroduction to the Staggered DiD method.\nApplication of staggered DiD for the case study from the music industry\nConclusions\nQ&A\nTarget Audience: The talk would be beneficial for data scientists, researchers, and practitioners interested in causal inference, marketing analytics, and quasi-experimental design. Attendees should have a basic understanding of statistical methods used in data science.\n\nKey Takeaways:\n\nUnderstanding of the DiD approach and its limitations in the context of analyses with observational data.\nInsights into the Staggered DiD method and its application.\nPractical knowledge about executing and evaluating DiD studies effectively.\n\nBio:\nNazli M. Alagoz\nI am a quantitative researcher and data scientist with a strong background in marketing, economics, and econometrics. My focus is on using data-driven approaches to tackle complex business challenges, uncover valuable insights, and drive impactful decisions. As a Ph.D. 
candidate in quantitative marketing, I specialize in causal inference, machine learning, and experimental design to address cutting-edge research questions.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "The Difference-in-Differences (DiD) methodology is a popular causal inference method utilized by leading tech firms such as Microsoft Research, LinkedIn, Meta, and Uber. Yet recent studies suggest that traditional DiD methods may have significant limitations when treatment timings differ. An effective alternative is the implementation of the staggered DiD design. We exemplify this by investigating an interesting question in the music industry: Does featuring a song in TV shows influence its popularity, and are there specific factors that could moderate this impact?\n\nDifference-in-differences (DiD) is a causal inference method frequently used in empirical research in industry and academia. However, standard DiD has limitations when interventions occur at different times or affect varying groups. This talk will highlight the application of the Staggered DiD method, a more nuanced approach that addresses these limitations, in the context of the music industry. We will try to answer the question of how music features in TV shows affect music popularity and how this effect might change for different types of music using the staggered DiD method. Attendees will gain an understanding of causal inference through observational studies and specifically how the new DiD methods are used through an interesting and original case study.\n\nThe talk will be structured as follows:\n\nIntro to the case (e.g., background on music features on TV, dataset)\nExplanation of the DiD approach and its limitations.\nIntroduction to the Staggered DiD method.\nApplication of staggered DiD for the case study from the music industry\nConclusions\nQ&A\nTarget Audience: The talk would be beneficial for data scientists, researchers, and practitioners interested in causal inference, marketing analytics, and quasi-experimental design. Attendees should have a basic understanding of statistical methods used in data science.\n\nKey Takeaways:\n\nUnderstanding of the DiD approach and its limitations in the context of analyses with observational data.\nInsights into the Staggered DiD method and its application.\nPractical knowledge about executing and evaluating DiD studies effectively.\n\nBio:\nNazli M. Alagoz\nI am a quantitative researcher and data scientist with a strong background in marketing, economics, and econometrics. 
My focus is on using data-driven approaches to tackle complex business challenges, uncover valuable insights, and drive impactful decisions. As a Ph.D. candidate in quantitative marketing, I specialize in causal inference, machine learning, and experimental design to address cutting-edge research questions.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1278, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json b/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json index 9322ffb74..ec2c6b0cb 100644 --- a/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json +++ b/pydata-amsterdam-2023/videos/sukel-multimodal-product-demand-forecasting-from-pixels-on-your-screen-to-a-meal-on-your-plate.json @@ -1,5 +1,5 @@ { - "description": "The customers of Picnic use images and texts of products to decide if they like our products, so why not include those data streams in our Temporal Fusion Transformers that we use for Product Demand Forecasting?\n\nJoin us for a thrilling journey through convolutional, graph-based, and transformer-based architectures. Learn about methods to turn images, texts, and geographical information into features for other applications as we did for product demand forecasting. Discover how Picnic Technologies uses state-of-the-art multimodal approaches for demand forecasting to prevent food waste and keep our customers happy!\n\nEver wondered how we keep your favorite brand of potato chips in stock, while that exotic sauce is forever \"currently unavailable\"? We'll reveal the secrets behind these mysteries in our talk on how we are using recent advancements in visual, textual, and contextual information processing techniques to optimize our Product Demand Forecasting. 
Because everybody loves looking at pictures of groceries but prefers having them available and on their doorstep (delivered for free).\n\nWe begin by shedding light on traditional product demand forecasting - the 'old potatoes' of the industry - and its limitations, like the notorious cold start problem and category dynamics.\n\nOur talk is a must-watch for data scientists, product managers, supply chain wizards, and anyone who has ever been curious about the new innovations in number-crunching that gets your favorite snack from the factory to your front door. If you're in the e-commerce or retail industries, this talk will be as essential as oatmilk and bread in a shopping list. Don\u2019t worry if words like multimodal, temporal, and fusion sound intimidating; They will be explained in a way that is informative and entertaining if you have seen them before but also if you have not.\n\nWe promise it\u2019s not all graphs and matrices \u2013 expect an unexpected rollercoaster ride through the aisle of our digital store. With each turn, you'll discover how our multimodal method uses product images, textual descriptions, and additional contextual information to predict if potatoes will overtake pasta in popularity next month. We'll show you the \u2018cart\u2019 loads of data behind these predictions, putting a fun spin on the world of groceries.\n\nIn the grand finale, we\u2019ll take you behind the scenes of our model's showdown with traditional methods. Spoiler alert: our method doesn\u2019t just predict demand; it leaves the traditional methods looking like overripe bananas in the back of the fridge (which is a bad state for bananas to be in).\n\nThe main takeaway from our talk - besides a craving for potatoes - will be an understanding of multimodal demand forecasting and how all these different types of data are becoming easier and easier to use for real-world business value. By the end of our talk, you'll be filled with ideas (and the sudden need to do groceries with Picnic, you are our target audience: Loving reliability, good products and you have busy jobs), inspired by the potential of multimodal machine learning in forecasting. So, whether you're a data scientist, product manager, or a curious shopper, come along for an enjoyable trip through the world of groceries and demand forecasting!\n\nPrepare your shopping list and join us. Just remember, our model may predict the demand for potatoes, but it's still up to you to remember the dip!\n\nBio:\nMaarten Sukel\nMaarten is a Data Scientist working at Picnic Technologies working mostly on Demand Forecasting and running machine learning at scale. Meanwhile at the University of Amsterdam, he works on research into the use of multimodal approaches for a range of applications.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. 
PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "The customers of Picnic use images and texts of products to decide if they like our products, so why not include those data streams in our Temporal Fusion Transformers that we use for Product Demand Forecasting?\n\nJoin us for a thrilling journey through convolutional, graph-based, and transformer-based architectures. Learn about methods to turn images, texts, and geographical information into features for other applications as we did for product demand forecasting. Discover how Picnic Technologies uses state-of-the-art multimodal approaches for demand forecasting to prevent food waste and keep our customers happy!\n\nEver wondered how we keep your favorite brand of potato chips in stock, while that exotic sauce is forever \"currently unavailable\"? We'll reveal the secrets behind these mysteries in our talk on how we are using recent advancements in visual, textual, and contextual information processing techniques to optimize our Product Demand Forecasting. Because everybody loves looking at pictures of groceries but prefers having them available and on their doorstep (delivered for free).\n\nWe begin by shedding light on traditional product demand forecasting - the 'old potatoes' of the industry - and its limitations, like the notorious cold start problem and category dynamics.\n\nOur talk is a must-watch for data scientists, product managers, supply chain wizards, and anyone who has ever been curious about the new innovations in number-crunching that gets your favorite snack from the factory to your front door. If you're in the e-commerce or retail industries, this talk will be as essential as oatmilk and bread on a shopping list. Don\u2019t worry if words like multimodal, temporal, and fusion sound intimidating; they will be explained in a way that is informative and entertaining if you have seen them before but also if you have not.\n\nWe promise it\u2019s not all graphs and matrices \u2013 expect an unexpected rollercoaster ride through the aisle of our digital store. With each turn, you'll discover how our multimodal method uses product images, textual descriptions, and additional contextual information to predict if potatoes will overtake pasta in popularity next month. We'll show you the \u2018cart\u2019 loads of data behind these predictions, putting a fun spin on the world of groceries.\n\nIn the grand finale, we\u2019ll take you behind the scenes of our model's showdown with traditional methods. Spoiler alert: our method doesn\u2019t just predict demand; it leaves the traditional methods looking like overripe bananas in the back of the fridge (which is a bad state for bananas to be in).\n\nThe main takeaway from our talk - besides a craving for potatoes - will be an understanding of multimodal demand forecasting and how all these different types of data are becoming easier and easier to use for real-world business value. By the end of our talk, you'll be filled with ideas (and the sudden need to do groceries with Picnic \u2013 you are our target audience: you love reliability and good products, and you have busy jobs), inspired by the potential of multimodal machine learning in forecasting. 
So, whether you're a data scientist, product manager, or a curious shopper, come along for an enjoyable trip through the world of groceries and demand forecasting!\n\nPrepare your shopping list and join us. Just remember, our model may predict the demand for potatoes, but it's still up to you to remember the dip!\n\nBio:\nMaarten Sukel\nMaarten is a Data Scientist at Picnic Technologies, working mostly on Demand Forecasting and running machine learning at scale. Meanwhile, at the University of Amsterdam, he works on research into the use of multimodal approaches for a range of applications.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1165, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json b/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json index 76c700946..130e10bbc 100644 --- a/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json +++ b/pydata-amsterdam-2023/videos/tables-as-code-the-journey-from-ad-hoc-scripts-to-maintainable-etl-workflows-at-booking-com.json @@ -1,5 +1,5 @@ { - "description": "Until a few years ago, data science & engineering at Booking.com had grown largely in an ad-hoc manner. This growth has led to a labyrinth of unrelated scripts representing Extract-Transform-Load (ETL) processes. Without options for quickly testing cross-application interfaces, maintenance and contribution grew unwieldy, and debugging in production was a common practice.\n\nOver the past several years, we\u2019ve spearheaded a transition from isolated workflows to a well-structured community-maintained monorepo - a task that required not just technical adaptation, but also a cultural shift.\n\nCentral to this transformation is the adoption of the concept of \"tables as code\", an approach that has changed the way we write ETL. Our lightweight PySpark extension represents table metadata as a Python class, exposing data to code, and enabling efficient unit test setup and validation.\n\nIn this talk, we walk you through \u201ctables as code\u201d design and complementary tools such as efficient unit testing, robust telemetry, and automated builds using Bazel. Moreover, we will cover the transformation process, including enabling people with non-engineering backgrounds to create fully tested and maintainable ETL. 
This includes internal training, maintainers, and support strategies aimed at fostering a community knowledgeable in best practices.\n\nThis talk is aimed at ETL-adjacent data science practitioners, ideally who have been wondering how to push code quality forward at a data-centric organization.\n\nIntroduction (0-5 minutes): We begin by shedding light on the infrastructure that hosted the old scripts, and discuss our motivation for change. It\u2019s worth mentioning that this transformative decision emerged from individual product teams, not from an executive mandate.\nTables as Code (10 minutes): We'll then introduce the concept of 'tables as code', detailing how this approach enables efficient testing.\nMonorepo Transformation (10 minutes): Building on this foundation, we'll explore how 'tables as code' grew into a vast monorepo with thousands of tests. We'll discuss how we scaled our processes and nurtured this project as a community effort.\nCommunity Growth and Future Plans (5 minutes): In our closing segment, we'll share insights gained from growing this project as a community, highlight strategies for orchestrating training, community support, and finally, share our future plans both within and outside our organization.\n\nBios:\nBram van den Akker\nBram van den Akker is a Senior Machine Learning Scientist at Booking.com with a background in Computer Science and Artificial Intelligence from the University of Amsterdam. At Booking.com, Bram has been one of the founders of bkng-data, an internal collection of Python tools aimed at improving code quality, testing, and streamlining CI/CD for data practitioners.\nAside from bkng-data, Bram's work focuses on bridging the gap between applied research and practical requirements for Bandit Feedback all across Booking.com. Previously, Bram has held positions at Shopify, Panasonic & Eagle Eye Networks, and has peer reviewed contributions and tutorials to conferences and workshops such as TheWebConf (WWW), RecSys, and KDD, including a best-paper award.\n\n\nJon Smith\nJon Smith is a Senior Machine Learning Scientist at Booking.com, having spent his time working in fraud detection and performance marketing. In these areas, he focusses on strengthening software practices within critical ML systems, through evangelising code quality and unit testing.\nHe studied Mathematics and Computer Science at Acadia University and Simon Fraser University in Canada, and spent some time as a Machine Learning Engineer at the Canadian Broadcasting Corporation.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? 
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Until a few years ago, data science & engineering at Booking.com had grown largely in an ad-hoc manner. This growth has led to a labyrinth of unrelated scripts representing Extract-Transform-Load (ETL) processes. Without options for quickly testing cross-application interfaces, maintenance and contribution grew unwieldy, and debugging in production was a common practice.\n\nOver the past several years, we\u2019ve spearheaded a transition from isolated workflows to a well-structured community-maintained monorepo - a task that required not just technical adaptation, but also a cultural shift.\n\nCentral to this transformation is the adoption of the concept of \"tables as code\", an approach that has changed the way we write ETL. Our lightweight PySpark extension represents table metadata as a Python class, exposing data to code, and enabling efficient unit test setup and validation.\n\nIn this talk, we walk you through \u201ctables as code\u201d design and complementary tools such as efficient unit testing, robust telemetry, and automated builds using Bazel. Moreover, we will cover the transformation process, including enabling people with non-engineering backgrounds to create fully tested and maintainable ETL. This includes internal training, maintainers, and support strategies aimed at fostering a community knowledgeable in best practices.\n\nThis talk is aimed at ETL-adjacent data science practitioners, ideally those who have been wondering how to push code quality forward at a data-centric organization.\n\nIntroduction (0-5 minutes): We begin by shedding light on the infrastructure that hosted the old scripts, and discuss our motivation for change. It\u2019s worth mentioning that this transformative decision emerged from individual product teams, not from an executive mandate.\nTables as Code (10 minutes): We'll then introduce the concept of 'tables as code', detailing how this approach enables efficient testing.\nMonorepo Transformation (10 minutes): Building on this foundation, we'll explore how 'tables as code' grew into a vast monorepo with thousands of tests. We'll discuss how we scaled our processes and nurtured this project as a community effort.\nCommunity Growth and Future Plans (5 minutes): In our closing segment, we'll share insights gained from growing this project as a community, highlight strategies for orchestrating training, community support, and finally, share our future plans both within and outside our organization.\n\nBios:\nBram van den Akker\nBram van den Akker is a Senior Machine Learning Scientist at Booking.com with a background in Computer Science and Artificial Intelligence from the University of Amsterdam. At Booking.com, Bram has been one of the founders of bkng-data, an internal collection of Python tools aimed at improving code quality, testing, and streamlining CI/CD for data practitioners.\nAside from bkng-data, Bram's work focuses on bridging the gap between applied research and practical requirements for Bandit Feedback all across Booking.com. Previously, Bram has held positions at Shopify, Panasonic & Eagle Eye Networks, and has contributed peer-reviewed papers and tutorials to conferences and workshops such as TheWebConf (WWW), RecSys, and KDD, including a best-paper award.\n\n\nJon Smith\nJon Smith is a Senior Machine Learning Scientist at Booking.com, having spent his time working in fraud detection and performance marketing. 
In these areas, he focusses on strengthening software practices within critical ML systems, through evangelising code quality and unit testing.\nHe studied Mathematics and Computer Science at Acadia University and Simon Fraser University in Canada, and spent some time as a Machine Learning Engineer at the Canadian Broadcasting Corporation.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1464, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json b/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json index 16e1dbb49..01a7fc218 100644 --- a/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json +++ b/pydata-amsterdam-2023/videos/turning-your-data-ai-algorithms-into-full-web-applications-in-no-time-with-taipy-pdams-2023.json @@ -1,5 +1,5 @@ { - "description": "Numerous packages exist within the Python open-source ecosystem for algorithm building and data visualization. However, a significant challenge persists, with over 85% of Data Science Pilots failing to transition to the production stage.\n\nThis talk introduces Taipy, an open-source Python library for front-end and back-end development. It enables Data Scientists and Python Developers to create pilots and production-ready applications for end-users.\n\nIts syntax facilitates the creation of interactive, customizable, and multi-page dashboards with augmented Markdown. Without the need for web development expertise (no CSS or HTML), users can generate highly interactive interfaces.\n\nAdditionally, Taipy is engineered to construct robust and tailored data-driven back-end applications. Intuitive components like pipelines and data flow orchestration empower users to organize and manage data effectively. Taipy also introduces a unique Scenario Management functionality, facilitating \"what-if\" analysis for data scientists and end-users.\n\nDuring this talk, we will showcase the capabilities of Taipy:\n- to create highly-interactive applications easily without any knowledge in web development.\n- to fill a void within the standard Python back-end stack, offering a powerful solution for data-driven applications.\n\nBios:\nFlorian Jacta\n-Specialist of Taipy, a low-code open-source Python package enabling Python developers to develop a production-ready AI application quickly. 
Package pre-sales and after-sales function.\n-Data Scientist for Groupe Les Mousquetaires (Intermarche) and ATOS.\n-Developed several Predictive Models as part of strategic AI projects.\n-Master in Applied Mathematics from INSA, Major in Data Science and Mathematical Optimization.\n\nAlexandre Sajus\nAlex worked in Amazon Business Intelligence. He graduated with a Master of Engineering at CentraleSup\u00e9lec - Paris-Saclay University and joined Taipy as a Community Success Consultant. His primary skills are MLOps, Machine Learning, Data Engineering, and Python.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Numerous packages exist within the Python open-source ecosystem for algorithm building and data visualization. However, a significant challenge persists, with over 85% of Data Science Pilots failing to transition to the production stage.\n\nThis talk introduces Taipy, an open-source Python library for front-end and back-end development. It enables Data Scientists and Python Developers to create pilots and production-ready applications for end-users.\n\nIts syntax facilitates the creation of interactive, customizable, and multi-page dashboards with augmented Markdown. Without the need for web development expertise (no CSS or HTML), users can generate highly interactive interfaces.\n\nAdditionally, Taipy is engineered to construct robust and tailored data-driven back-end applications. Intuitive components like pipelines and data flow orchestration empower users to organize and manage data effectively. Taipy also introduces a unique Scenario Management functionality, facilitating \"what-if\" analysis for data scientists and end-users.\n\nDuring this talk, we will showcase the capabilities of Taipy:\n- to create highly-interactive applications easily without any knowledge in web development.\n- to fill a void within the standard Python back-end stack, offering a powerful solution for data-driven applications.\n\nBios:\nFlorian Jacta\n-Specialist in Taipy, a low-code open-source Python package enabling Python developers to develop a production-ready AI application quickly. Handles package pre-sales and after-sales functions.\n-Data Scientist for Groupe Les Mousquetaires (Intermarche) and ATOS.\n-Developed several Predictive Models as part of strategic AI projects.\n-Master in Applied Mathematics from INSA, Major in Data Science and Mathematical Optimization.\n\nAlexandre Sajus\nAlex worked in Amazon Business Intelligence. 
He graduated with a Master of Engineering from CentraleSup\u00e9lec - Paris-Saclay University and joined Taipy as a Community Success Consultant. His primary skills are MLOps, Machine Learning, Data Engineering, and Python.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1700, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json b/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json index 81ef323e7..1bda1a881 100644 --- a/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json +++ b/pydata-amsterdam-2023/videos/using-ai-to-make-amsterdam-greener-safer-and-more-accessible.json @@ -1,5 +1,5 @@ { - "description": "In this talk, we would like to introduce you to the urban challenges that the City of Amsterdam is trying to solve using AI. We will walk you through the technical details behind one of our projects and invite you to join us in the ethical development of cool AI applications for social good.\n\nThe City of Amsterdam has the mission of promoting the development of artificial intelligence to improve the lives of Amsterdam\u2019s residents. We conduct cutting-edge research into the analysis of text, images, and point cloud data, all with the aim of solving the urban challenges of our generation and the ones to come.\n\nRecently, we\u2019ve been working on making our city more inclusive by mapping accessibility infrastructure in the public space. We\u2019ve been also working on making the city safer by localizing all street lights and automatically extracting some of their characteristics. Finally, our analysis of trees and greenery in the city can help increase the city's biodiversity and also help us reach our climate goals.\n\nWorking in the public sector means that technology itself is only a part of our job. On a daily basis, we also need to ensure that all development is done according to our city\u2019s values \u2013 for example, that applications benefit everyone, that we are open and transparent, and that we give citizens a say in shaping their (digital) city. 
This means (at the very least) that open-source development and the publication of methodology, data, and insights for all of our algorithms are an inseparable part of work.\n\nIn this talk, we would like to introduce you to the challenges that we face, walk you through the technical details behind one of our projects, and share the related open-source materials that can be reused by the PyData community. Finally, we hope to inspire you to join us in the ethical development of cool AI applications for social good.\n\nBios:\nShayla Jansen\nShayla is a data scientist at the City of Amsterdam, part of the dedicated Urban Innovation and R&D Team which aims to improve the livability of Amsterdam by bringing AI research to the city.\n\nNiek IJzerman\nNiek is a data scientist at the City of Amsterdam, part of the dedicated Urban Innovation and R&D Team. Niek is a recent graduate from the MSc AI program at the UvA and currently focusses on automated asset management in 3D using AI and Data Science.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In this talk, we would like to introduce you to the urban challenges that the City of Amsterdam is trying to solve using AI. We will walk you through the technical details behind one of our projects and invite you to join us in the ethical development of cool AI applications for social good.\n\nThe City of Amsterdam has the mission of promoting the development of artificial intelligence to improve the lives of Amsterdam\u2019s residents. We conduct cutting-edge research into the analysis of text, images, and point cloud data, all with the aim of solving the urban challenges of our generation and the ones to come.\n\nRecently, we\u2019ve been working on making our city more inclusive by mapping accessibility infrastructure in the public space. We\u2019ve also been working on making the city safer by localizing all street lights and automatically extracting some of their characteristics. Finally, our analysis of trees and greenery in the city can help increase the city's biodiversity and also help us reach our climate goals.\n\nWorking in the public sector means that technology itself is only a part of our job. On a daily basis, we also need to ensure that all development is done according to our city\u2019s values \u2013 for example, that applications benefit everyone, that we are open and transparent, and that we give citizens a say in shaping their (digital) city. 
This means (at the very least) that open-source development and the publication of methodology, data, and insights for all of our algorithms are an inseparable part of work.\n\nIn this talk, we would like to introduce you to the challenges that we face, walk you through the technical details behind one of our projects, and share the related open-source materials that can be reused by the PyData community. Finally, we hope to inspire you to join us in the ethical development of cool AI applications for social good.\n\nBios:\nShayla Jansen\nShayla is a data scientist at the City of Amsterdam, part of the dedicated Urban Innovation and R&D Team which aims to improve the livability of Amsterdam by bringing AI research to the city.\n\nNiek IJzerman\nNiek is a data scientist at the City of Amsterdam, part of the dedicated Urban Innovation and R&D Team. Niek is a recent graduate from the MSc AI program at the UvA and currently focusses on automated asset management in 3D using AI and Data Science.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1220, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json index bd6728435..9a4d19831 100644 --- a/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/van-den-bossche-what-the-pdep-an-overview-of-some-upcoming-pandas-changes-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Last year, the pandas community adopted a new process for making significant changes to the library: the Pandas Enhancement Proposals, aka PDEPs (similar to Python's PEPs and numpy's NEPs, ..). In the meantime, several of those proposals have been proposed and discussed, and some already accepted, shaping up the pandas roadmap (https://pandas.pydata.org/about/roadmap.html).\n\nThe goal of this talk is to introduce you to this new process, and give an overview of a few of the proposed PDEPs. This way, you will learn about some of the behavioural changes you can expect as a pandas user in the near future.\n\nOver the many years of development, pandas has grown (or kept since the early days) quite some corner cases and inconsistencies. Some of the proposed PDEPs are an attempt to tackle those? 
For example, one accepted proposal is to ban any (up)casting in \"setitem-like\" operations, avoiding surprising data type changes. There is also a proposal to stop providing the inplace option for many methods, because even though the name might imply otherwise, those operations were not actually done in-place. Another major change that is under way is a change to the copy and view semantics of operations in pandas (related to the well-known (or hated) SettingWithCopyWarning). This is already available as an experimental opt-in to test and use the new behaviour, and will probably be a highlight of pandas 3.0.\n\nBio:\nJoris Van den Bossche\nI am a core contributor to Pandas and Apache Arrow, and maintainer of GeoPandas. I did a PhD at Ghent University and VITO in air quality research and worked at the Paris-Saclay Center for Data Science. Currently, I work at Voltron Data, contributing to Apache Arrow, and am a freelance teacher of python (pandas) at Ghent University.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Last year, the pandas community adopted a new process for making significant changes to the library: the Pandas Enhancement Proposals, aka PDEPs (similar to Python's PEPs and numpy's NEPs, ..). In the meantime, several such proposals have been put forward and discussed, and some already accepted, shaping up the pandas roadmap (https://pandas.pydata.org/about/roadmap.html).\n\nThe goal of this talk is to introduce you to this new process, and give an overview of a few of the proposed PDEPs. This way, you will learn about some of the behavioural changes you can expect as a pandas user in the near future.\n\nOver the many years of development, pandas has grown (or kept since the early days) quite some corner cases and inconsistencies. Some of the proposed PDEPs are an attempt to tackle those. For example, one accepted proposal is to ban any (up)casting in \"setitem-like\" operations, avoiding surprising data type changes. There is also a proposal to stop providing the inplace option for many methods, because even though the name might imply otherwise, those operations were not actually done in-place. Another major change that is under way is a change to the copy and view semantics of operations in pandas (related to the well-known (or hated) SettingWithCopyWarning). 
This is already available as an experimental opt-in to test and use the new behaviour, and will probably be a highlight of pandas 3.0.\n\nBio:\nJoris Van den Bossche\nI am a core contributor to Pandas and Apache Arrow, and maintainer of GeoPandas. I did a PhD at Ghent University and VITO in air quality research and worked at the Paris-Saclay Center for Data Science. Currently, I work at Voltron Data, contributing to Apache Arrow, and am a freelance teacher of python (pandas) at Ghent University.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1608, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json index 393c85914..fe5000d5d 100644 --- a/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/vicki-boykis-keynote-build-and-keep-your-context-window-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "What can we learn from engineering, the history of machine learning, fantasy books, the early 1990s internet, and art history about how to be successful engineers in the modern-day data landscape ? We\u2019ll learn together in this talk.\n\nBios:\nVicki Boykis\nVicki Boykis works on end-to-end ML applications. Her interests include the intersection of information retrieval and large language models, applying engineering best practices to machine learning, and Nutella. She works at Duo Security and she lives in Philadelphia with her family. Her favorite hobby was making terrible jokes on Twitter when it was still good. She recently wrote a deep dive on embeddings and put together Normconf, celebrating normcore workflows in ML.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. 
\n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "What can we learn from engineering, the history of machine learning, fantasy books, the early 1990s internet, and art history about how to be successful engineers in the modern-day data landscape? We\u2019ll learn together in this talk.\n\nBios:\nVicki Boykis\nVicki Boykis works on end-to-end ML applications. Her interests include the intersection of information retrieval and large language models, applying engineering best practices to machine learning, and Nutella. She works at Duo Security and she lives in Philadelphia with her family. Her favorite hobby was making terrible jokes on Twitter when it was still good. She recently wrote a deep dive on embeddings and put together Normconf, celebrating normcore workflows in ML.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 2276, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json b/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json index 5c6bca8c2..152092e92 100644 --- a/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json +++ b/pydata-amsterdam-2023/videos/vincent-warmerdam-keynote-natural-intelligence-is-all-you-need-tm.json @@ -1,5 +1,5 @@ { - "description": "In this talk I will try to show you what might happen if you allow yourself the creative freedom to rethink and reinvent common practices once in a while. As it turns out, in order to do that, natural intelligence is all you need. And we may start needing a lot of it in the near future\n\nI've met a lot of authoritative people in my field who pass out advise that sounds like this:\n\nWorking on recommenders? Collect all the data! Sessions!\nWorking on text classification? That's a solved problem! Bert!\nWorking with embeddings? There's a library for that already!\nWorking on tabular data? XGBoost for the win! 
GridSearch!\nIn short: \"this is how you do data science, don't go and reinvent the wheel\".\n\nIf you spend 5 minutes thinking about \"the invention of the wheel\" though, then you may start to rethink. After all: the wheels on a bike are different from the wheels on an airplane, just like the wheels of a tractor. And for Pete's sake: that's a good thing! If we hadn't reinvented those wheels, we're be stuck with wooden horse carts.\n\nSo ... what might happen if we take the time to rethink a few things?\n\nSpecifically, this keynote will discuss the following topics:\n\ntext classification\nfraud detection\nproduct recommenders\nactive learning\nembeddings\nI hope you'll join me for some new ideas as well as some live demos.\n\nBio:\nVincent Warmerdam\nVincent D. Warmerdam is a software developer and senior data person. He\u2019s currently works over at Explosion to work on data quality tools for developers. He\u2019s also known for creating calmcode.io as well as a bunch of open source projects. You can check out his blog over at koaning.io to learn more about those.\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In this talk I will try to show you what might happen if you allow yourself the creative freedom to rethink and reinvent common practices once in a while. As it turns out, in order to do that, natural intelligence is all you need. And we may start needing a lot of it in the near future.\n\nI've met a lot of authoritative people in my field who pass out advice that sounds like this:\n\nWorking on recommenders? Collect all the data! Sessions!\nWorking on text classification? That's a solved problem! Bert!\nWorking with embeddings? There's a library for that already!\nWorking on tabular data? XGBoost for the win! GridSearch!\nIn short: \"this is how you do data science, don't go and reinvent the wheel\".\n\nIf you spend 5 minutes thinking about \"the invention of the wheel\" though, then you may start to rethink. After all: the wheels on a bike are different from the wheels on an airplane, just like the wheels of a tractor. And for Pete's sake: that's a good thing! If we hadn't reinvented those wheels, we'd be stuck with wooden horse carts.\n\nSo ... what might happen if we take the time to rethink a few things?\n\nSpecifically, this keynote will discuss the following topics:\n\ntext classification\nfraud detection\nproduct recommenders\nactive learning\nembeddings\nI hope you'll join me for some new ideas as well as some live demos.\n\nBio:\nVincent Warmerdam\nVincent D. 
Warmerdam is a software developer and senior data person. He currently works at Explosion on data quality tools for developers. He\u2019s also known for creating calmcode.io as well as a bunch of open source projects. You can check out his blog over at koaning.io to learn more about those.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 2818, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json b/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json index a77375547..eb83e778c 100644 --- a/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json +++ b/pydata-amsterdam-2023/videos/wessel-sandtke-dont-judge-a-book-by-its-cover-using-llm-created-datasets-to-train-models.json @@ -1,5 +1,5 @@ { - "description": "Don\u2019t judge a book by its cover: Using LLM created datasets to train models that detect literary features\n\nExisting book recommendation systems like Goodreads are based on correlating the reading habits of people. But what if you want a humorous book? Or a book that is set in 19th century Paris? Or a thriller, but without violence?\nWe build book recommendation systems for Dutch libraries based on more than a dozen features from historical setting, to writing style, to main character characteristics. This allows us to tailor each recommendation to individual readers.\n\nThe recent developments in LLMs are an interesting area for us to explore to improve our recommendations. However, running LLMs in production is unfortunately not always feasible. The associated costs may be too high, and running code from third parties in your daily pipeline may be undesirable. And then there\u2019s data privacy - or, in our case, intellectual copyright - to be considered as well.\n\nSo how can you reap the benefits of an LLM, without exposing yourself or your company to some of these major downsides?\n\nWe utilized LLMs to generate custom, tailor-made datasets for our literary feature detection models to train on. 
This allowed us to benefit from the high performance of large language models, without continued reliance on external parties such as OpenAI or Google.\n\nWhile you may think LLMs are not as effective for languages other than English, we\u2019ve seen major improvements in several of our models.\n\nIn this talk, we\u2019ll highlight:\n- A note on recommenders: Why does Goodreads recommender not work for me, while Spotify\u2019s Discover Weekly is so good?\n- Different methods of getting data from books\n- Iterative process of creating a dataset using an LLM and retraining our models\n- Some notes on intellectual property and evaluation of models.\n\nBio:\nWessel Sandtke\nTypewriter repairman turned Machine Learning Engineer, now working for Bookarang, a Dutch startup working with Dutch libraries to improve the recommendations for its members.\nWrote several picture books, but is not allowed to boost those in the recommendation system.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Don\u2019t judge a book by its cover: Using LLM created datasets to train models that detect literary features\n\nExisting book recommendation systems like Goodreads are based on correlating the reading habits of people. But what if you want a humorous book? Or a book that is set in 19th century Paris? Or a thriller, but without violence?\nWe build book recommendation systems for Dutch libraries based on more than a dozen features from historical setting, to writing style, to main character characteristics. This allows us to tailor each recommendation to individual readers.\n\nThe recent developments in LLMs are an interesting area for us to explore to improve our recommendations. However, running LLMs in production is unfortunately not always feasible. The associated costs may be too high, and running code from third parties in your daily pipeline may be undesirable. And then there\u2019s data privacy - or, in our case, intellectual copyright - to be considered as well.\n\nSo how can you reap the benefits of an LLM, without exposing yourself or your company to some of these major downsides?\n\nWe utilized LLMs to generate custom, tailor-made datasets for our literary feature detection models to train on. 
This allowed us to benefit from the high performance of large language models, without continued reliance on external parties such as OpenAI or Google.\n\nWhile you may think LLMs are not as effective for languages other than English, we\u2019ve seen major improvements in several of our models.\n\nIn this talk, we\u2019ll highlight:\n- A note on recommenders: Why does the Goodreads recommender not work for me, while Spotify\u2019s Discover Weekly is so good?\n- Different methods of getting data from books\n- Iterative process of creating a dataset using an LLM and retraining our models\n- Some notes on intellectual property and evaluation of models.\n\nBio:\nWessel Sandtke\nTypewriter repairman turned Machine Learning Engineer, now working for Bookarang, a Dutch startup working with Dutch libraries to improve the recommendations for its members.\nWrote several picture books, but is not allowed to boost those in the recommendation system.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1340, "language": "eng", "recorded": "2023-09-14", diff --git a/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json index 68e08ab25..2dffe352b 100644 --- a/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/zhao-qiao-graph-neural-networks-for-real-world-fraud-detection-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "Fraud is a major problem for financial services companies. As fraudsters change tactics, our detection methods need to get smarter. Graph neural networks (GNNs) are a promising model to improve detection performance. Unlike traditional machine learning models or rule-based engines, GNNs can effectively learn from subtle relationships by aggregating neighborhood information in the financial transaction networks. However, it remains a challenge to adopt this new approach in production.\n\nThe goal of this talk is to share best practices for building a production ready GNN solution and hopefully spark your interest to apply GNNs to your own use cases.\n\nIn this talk, we focus on suspicious account detection for online marketplaces. These platforms allow users to set up shops and sell products with little friction. Unfortunately, this attracts fraudsters who abuse these platforms.
We use GNNs to do supervised learning based on accounts previously flagged as fraudulent, so that we can learn from both account properties and the relationship between accounts. However, productionizing GNNs is a big challenge. Addressing this challenge purely using open source packages is the main focus of this talk.\n\nWe first give an overview of GNN-based fraud detection. Then we deep dive into utilizing PySpark and GraphFrames to build a transaction graph in a scalable way and convert it to DGL (Deep Graph Library) format. Next we share our experiences of setting up training and inference graphs in different time intervals, and deploying the end-to-end model pipeline in Airflow.\n\nAttendees are required to have a basic understanding of machine learning. In this informative talk, they will gain insights into fraud detection's challenges and learn best practices to productionize GNNs.\n\nBios:\nFeng Zhao\nFeng is a senior data scientist at Adyen. He is passionate about solving real business problems using innovative AI/machine learning approaches. He received his Ph.D. from the National University of Singapore.\n\nTingting Qiao\nSenior data scientist in Adyen, working in the Score team focusing on fraud detection.\nHaving PhD background in computer vision and natural language processing using deep neural networks. Familiar with prediction models, such as regression, classification models, etc., as well as the latest research techniques, such as adversarial learning, neural networks etc. Several years of experience with popular deep learning frameworks.\n\n===\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "Fraud is a major problem for financial services companies. As fraudsters change tactics, our detection methods need to get smarter. Graph neural networks (GNNs) are a promising model to improve detection performance. Unlike traditional machine learning models or rule-based engines, GNNs can effectively learn from subtle relationships by aggregating neighborhood information in the financial transaction networks. However, it remains a challenge to adopt this new approach in production.\n\nThe goal of this talk is to share best practices for building a production ready GNN solution and hopefully spark your interest to apply GNNs to your own use cases.\n\nIn this talk, we focus on suspicious account detection for online marketplaces. These platforms allow users to set up shops and sell products with little friction. Unfortunately, this attracts fraudsters who abuse these platforms. 
We use GNNs to do supervised learning based on accounts previously flagged as fraudulent, so that we can learn from both account properties and the relationship between accounts. However, productionizing GNNs is a big challenge. Addressing this challenge purely using open source packages is the main focus of this talk.\n\nWe first give an overview of GNN-based fraud detection. Then we deep dive into utilizing PySpark and GraphFrames to build a transaction graph in a scalable way and convert it to DGL (Deep Graph Library) format. Next we share our experiences of setting up training and inference graphs in different time intervals, and deploying the end-to-end model pipeline in Airflow.\n\nAttendees are required to have a basic understanding of machine learning. In this informative talk, they will gain insights into fraud detection's challenges and learn best practices to productionize GNNs.\n\nBios:\nFeng Zhao\nFeng is a senior data scientist at Adyen. He is passionate about solving real business problems using innovative AI/machine learning approaches. He received his Ph.D. from the National University of Singapore.\n\nTingting Qiao\nSenior data scientist at Adyen, working in the Score team focusing on fraud detection.\nHas a PhD background in computer vision and natural language processing using deep neural networks. Familiar with prediction models, such as regression, classification models, etc., as well as the latest research techniques, such as adversarial learning, neural networks etc. Several years of experience with popular deep learning frameworks.\n\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability?
Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1453, "language": "eng", "recorded": "2023-09-14", From 077a8ff5dd7707b32f6d97c4dcee1e943b581aa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ezequiel=20Leonardo=20Casta=C3=B1o?= <14986783+ELC@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:26:10 -0300 Subject: [PATCH 4/4] Fix conflicting characters --- ...etection-with-a-human-in-the-loop-pydata-amsterdam-2023.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json b/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json index 4a98d3c4e..8326dbf01 100644 --- a/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json +++ b/pydata-amsterdam-2023/videos/lieke-kools-standby-detection-with-a-human-in-the-loop-pydata-amsterdam-2023.json @@ -1,5 +1,5 @@ { - "description": "In the Netherlands a large share of energy is used by industry. By measuring the energy usage of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste. It turns out that in most factories, the biggest source of energy waste comes from idling machines. To be able to give valuable insights and provide relevant alerts to our customers, we set up a machine learning system for standby detection with a \u201chuman in the loop\u201d. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem. No background knowledge is required for this talk.\n\nIn the Netherlands a large share of energy is used by industry (less than 40% compared to only 14% used by households*). Eliminating energy waste in this sector is a big step forward towards a greener future. Therefore, Sensorfact made it its mission to eliminate all industrial energy waste. By measuring the energy usage (electricity or gas) of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste.\n\nIt turns out that in most factories, the biggest source of energy waste comes from forgetting to turn off machines when they are not used. Flagging idling machines based on their electricity usage may seem like a trivial problem at first, however the large variety in machines and production processes makes this a lot harder than you would expect. To be able to give valuable insights on idling machines and provide relevant alerts to our customers, we set up a machine learning system with a \u201chuman in the loop\u201d.\n\nIn many settings it is perfectly fine to embed a machine learning model in a process without any human interference. However, there are cases where it is better to keep a human in the loop. The most obvious use cases are those where there is simply no room for error, for example in medical applications. However, also in less life threatening it can be beneficial to have a human act as gatekeeper ensuring high quality outputs. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem, using the case of standby detection. 
We will share learnings from our own experience and along the way give you an overview of the (open source) tools we chose to use for the different facets of the project.\n\nNo background knowledge is required for this talk. If you are looking for inspiration on how to build a machine learning system with a human in the loop or if you are curious about sustainability use cases this talk may be interesting for you.\n\n*https://www.clo.nl/indicatoren/nl0052-energieverbruik-per-sector\n\nBio:\nLieke Kools\nLieke is lead data scientist at Sensorfact, a company aiming to eliminate all industrial energy waste for SME\u2019s. In her role she focusses on the data fueled products that help their consultants to efficiently and effectively give advice to customers. Before joining Sensorfact she worked as a data science consultant at Vantage AI and completed a PhD in econometrics.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", + "description": "In the Netherlands a large share of energy is used by industry. By measuring the energy usage of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste. It turns out that in most factories, the biggest source of energy waste comes from idling machines. To be able to give valuable insights and provide relevant alerts to our customers, we set up a machine learning system for standby detection with a \u201chuman in the loop\u201d. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem. No background knowledge is required for this talk.\n\nIn the Netherlands a large share of energy is used by industry (less than 40% compared to only 14% used by households). Eliminating energy waste in this sector is a big step forward towards a greener future. Therefore, Sensorfact made it its mission to eliminate all industrial energy waste. By measuring the energy usage (electricity or gas) of individual machines in real time it is possible to pinpoint when machines are operating inefficiently and help factories take measures to reduce energy waste.\n\nIt turns out that in most factories, the biggest source of energy waste comes from forgetting to turn off machines when they are not used. Flagging idling machines based on their electricity usage may seem like a trivial problem at first; however, the large variety in machines and production processes makes this a lot harder than you would expect.
To be able to give valuable insights on idling machines and provide relevant alerts to our customers, we set up a machine learning system with a \u201chuman in the loop\u201d.\n\nIn many settings it is perfectly fine to embed a machine learning model in a process without any human interference. However, there are cases where it is better to keep a human in the loop. The most obvious use cases are those where there is simply no room for error, for example in medical applications. However, also in less life-threatening settings it can be beneficial to have a human act as gatekeeper ensuring high quality outputs. In this talk we will go over the considerations that go into setting up a machine learning system with a human in the loop and showcase our approach to the problem, using the case of standby detection. We will share learnings from our own experience and along the way give you an overview of the (open source) tools we chose to use for the different facets of the project.\n\nNo background knowledge is required for this talk. If you are looking for inspiration on how to build a machine learning system with a human in the loop or if you are curious about sustainability use cases this talk may be interesting for you.\n\nhttps://www.clo.nl/indicatoren/nl0052-energieverbruik-per-sector\n\nBio:\nLieke Kools\nLieke is lead data scientist at Sensorfact, a company aiming to eliminate all industrial energy waste for SME\u2019s. In her role she focusses on the data fueled products that help their consultants to efficiently and effectively give advice to customers. Before joining Sensorfact she worked as a data science consultant at Vantage AI and completed a PhD in econometrics.\n\n\nwww.pydata.org\n\nPyData is an educational program of NumFOCUS, a 501(c)3 non-profit organization in the United States. PyData provides a forum for the international community of users and developers of data analysis tools to share ideas and learn from each other. The global PyData network promotes discussion of best practices, new approaches, and emerging technologies for data management, processing, analytics, and visualization. PyData communities approach data science using many languages, including (but not limited to) Python, Julia, and R. \n\nPyData conferences aim to be accessible and community-driven, with novice to advanced level presentations. PyData tutorials and talks bring attendees the latest project features along with cutting-edge use cases.\n\n00:00 Welcome!\n00:10 Help us add time stamps or captions to this video! See the description for details.\n\nWant to help add timestamps to our YouTube videos to help with discoverability? Find out more here: https://github.com/numfocus/YouTubeVideoTimestamps", "duration": 1220, "language": "eng", "recorded": "2023-09-14",
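Note on the zhao-qiao entry above: its abstract describes building a transaction graph with PySpark and GraphFrames and converting it to DGL (Deep Graph Library) format, but the scraped metadata contains no code. The sketch below is a minimal illustration of that conversion step only, not the speakers' implementation; the account/transaction toy data, the column names, and the "flagged" label are assumptions for the example.

~~~python
# Minimal sketch of the GraphFrames -> DGL conversion step mentioned in
# the zhao-qiao abstract. Assumes the GraphFrames Spark package is on the
# classpath (e.g. spark-submit --packages
# graphframes:graphframes:0.8.3-spark3.5-s_2.12) and that dgl and torch
# are installed. All data below is hypothetical toy data.
import dgl
import torch
from pyspark.sql import SparkSession
from graphframes import GraphFrame

spark = SparkSession.builder.appName("txn-graph-sketch").getOrCreate()

# Accounts as vertices; GraphFrames requires an "id" column.
# "flagged" stands in for the fraud labels mentioned in the abstract.
vertices = spark.createDataFrame(
    [("a1", 0), ("a2", 1), ("a3", 0)],
    ["id", "flagged"],
)

# Transactions as directed edges; GraphFrames requires "src" and "dst".
edges = spark.createDataFrame(
    [("a1", "a2", 100.0), ("a2", "a3", 250.0), ("a1", "a3", 40.0)],
    ["src", "dst", "amount"],
)

g = GraphFrame(vertices, edges)

# Map string account ids to contiguous integer node ids for DGL.
# Collecting to the driver is only acceptable for a toy graph; at scale
# you would write out the indexed edge list and load it in batches.
vertex_rows = g.vertices.collect()
id_map = {row["id"]: i for i, row in enumerate(vertex_rows)}
edge_rows = g.edges.collect()

src = torch.tensor([id_map[r["src"]] for r in edge_rows])
dst = torch.tensor([id_map[r["dst"]] for r in edge_rows])

# Build the DGL graph and attach node labels and edge features.
dgl_graph = dgl.graph((src, dst), num_nodes=len(id_map))
dgl_graph.ndata["label"] = torch.tensor([r["flagged"] for r in vertex_rows])
dgl_graph.edata["amount"] = torch.tensor([r["amount"] for r in edge_rows])

print(dgl_graph)  # Graph(num_nodes=3, num_edges=3, ...)
~~~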