Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 25 additions & 3 deletions youtube2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,28 @@ def main():
dest="collection_type",
)
parser.add_argument(
"--id", help="Youtube ID of the collection", required=True, dest="youtube_id"
"--subset-by", help="Subset of collection to download",
choices = ["recent", "views", "views-per-year"],
default="recent",
dest="subset_by",
)
parser.add_argument(
"--subset-videos",
help="Maximum number of videos to download",
type=int,
dest="subset_videos",
)
parser.add_argument(
"--subset-gb",
help="Cumulative size of videos to download (in GB)",
type=float,
default = 0,
dest="subset_gb",
)
parser.add_argument(
"--id", help="Youtube ID of the collection",
required=True,
dest="youtube_id",
)
parser.add_argument("--api-key", help="Youtube API Token", required=True)
parser.add_argument(
Expand Down Expand Up @@ -119,7 +140,7 @@ def main():

parser.add_argument(
"--creator",
help="Name of content creator. Defaults to Channel name or “Youtue Channels”",
help="Name of content creator. Defaults to Channel name or “Youtube Channels”",
)

parser.add_argument(
Expand Down Expand Up @@ -214,8 +235,10 @@ def main():

args = parser.parse_args()
logger.setLevel(logging.DEBUG if args.debug else logging.INFO)


try:
# Check for invalid values
if args.max_concurrency < 1:
raise ValueError(f"Invalid concurrency value: {args.max_concurrency}")
scraper = Youtube2Zim(**dict(args._get_kwargs()), youtube_store=YOUTUBE)
Expand All @@ -226,6 +249,5 @@ def main():
logger.exception(exc)
return 1


if __name__ == "__main__":
sys.exit(main())
38 changes: 36 additions & 2 deletions youtube2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
get_channel_json,
get_videos_authors_info,
get_videos_json,
subset_videos_json,
replace_titles,
save_channel_branding,
skip_deleted_videos,
Expand All @@ -65,6 +66,9 @@ def __init__(
self,
collection_type,
youtube_id,
subset_by,
subset_videos,
subset_gb,
api_key,
video_format,
low_quality,
Expand Down Expand Up @@ -101,6 +105,9 @@ def __init__(
self.youtube_id = youtube_id
self.api_key = api_key
self.dateafter = dateafter
self.subset_by = subset_by
self.subset_videos = subset_videos
self.subset_gb = subset_gb

# video-encoding info
self.video_format = video_format
Expand Down Expand Up @@ -474,7 +481,7 @@ def extract_videos_list(self):
# we only return video_ids that we'll use later on. per-playlist JSON stored
for playlist in self.playlists:
videos_json = get_videos_json(playlist.playlist_id)
# filter in videos within date range and filter away deleted videos

# we replace videos titles if --custom-titles is used
if self.custom_titles:
replace_titles(videos_json, self.custom_titles)
Expand All @@ -488,7 +495,34 @@ def extract_videos_list(self):
{v["contentDetails"]["videoId"]: v for v in filter_videos}
)
save_json(self.cache_dir, "videos", all_videos)
self.videos_ids = [*all_videos.keys()] # unpacking so it's subscriptable

if self.subset_by or self.subset_videos or self.subset_gb:
all_videos = subset_videos_json(
all_videos, self.subset_by, self.subset_videos, self.subset_gb
)
all_videos = {v["contentDetails"]["videoId"]: v for v in all_videos}
save_json(self.cache_dir, "videos", all_videos)

self.playlists[0].videos = all_videos
self.playlists[0].videos_count = len(all_videos)
self.playlists = self.playlists[:1]
for i, p in enumerate(self.playlists):
p.position = i

playlist_json = {
"playlist_id": self.playlists[0].playlist_id,
"title": self.playlists[0].title,
"videos_count": self.playlists[0].videos_count,
"videos": list(all_videos.values()),
}
# update the positions of the videos
for i, v in enumerate(playlist_json["videos"]):
v["position"] = i

save_json(self.cache_dir, f"playlist_{self.playlists[0].playlist_id}", playlist_json)

self.videos_ids = [*all_videos.keys()]


def download_video_files(self, max_concurrency):

Expand Down
83 changes: 76 additions & 7 deletions youtube2zim/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
# vim: ai ts=4 sts=4 et sw=4 nu

import requests
import yt_dlp

from contextlib import ExitStack
from dateutil import parser as dt_parser
from datetime import datetime
from pytube import extract
from zimscraperlib.download import stream_file
from zimscraperlib.image.transformation import resize_image
Expand Down Expand Up @@ -175,7 +178,7 @@ def get_videos_json(playlist_id):
PLAYLIST_ITEMS_API,
params={
"playlistId": playlist_id,
"part": "snippet,contentDetails",
"part": "snippet,contentDetails,status",
"key": YOUTUBE.api_key,
"maxResults": RESULTS_PER_PAGE,
"pageToken": page_token,
Expand All @@ -193,6 +196,76 @@ def get_videos_json(playlist_id):
save_json(YOUTUBE.cache_dir, fname, items)
return items

def subset_videos_json(videos, subset_by, subset_videos, subset_gb):
    """Return a subset of *videos* sorted and truncated per the CLI options.

    videos: dict mapping videoId -> playlistItem JSON (snippet+contentDetails)
    subset_by: "recent", "views" or "views-per-year" — the sort criterion
    subset_videos: max number of videos to keep (0/None keeps everything)
    subset_gb: cumulative size budget in GB (0 disables the size cap)

    Returns a list of video JSON items in sorted order. Side effect: each
    item gets a "statistics" key attached from the Videos API.
    """
    options = {
        "ignoreerrors": True,
    }

    # fetch statistics for all videos, 50 ids per request (API maximum)
    video_ids = [video["contentDetails"]["videoId"] for video in videos.values()]
    video_stats = {}
    for i in range(0, len(video_ids), 50):
        video_ids_chunk = video_ids[i : i + 50]
        req = requests.get(
            VIDEOS_API,
            params={
                "id": ",".join(video_ids_chunk),
                "part": "statistics",
                "key": YOUTUBE.api_key,
            },
        )
        # >= so that HTTP 400 itself is also logged (was `> 400`)
        if req.status_code >= 400:
            logger.error(f"HTTP {req.status_code} Error response: {req.text}")
        req.raise_for_status()
        for video in req.json()["items"]:
            video_stats[video["id"]] = video["statistics"]

    # attach statistics to each video; the API may omit some ids from the
    # statistics response, so default to a zero viewCount instead of KeyError
    for video in videos.values():
        video["statistics"] = video_stats.get(
            video["contentDetails"]["videoId"], {"viewCount": "0"}
        )

    # sort by the requested criterion. NOTE: the API returns viewCount as a
    # *string*, so it must be cast to int (string sort ranks "9" above "10")
    if subset_by == "views":
        videos = sorted(
            videos.values(),
            key=lambda video: int(video["statistics"].get("viewCount", 0)),
            reverse=True,
        )
    elif subset_by == "recent":
        # ISO-8601 timestamps sort correctly as strings
        videos = sorted(
            videos.values(),
            key=lambda video: video["snippet"]["publishedAt"],
            reverse=True,
        )
    elif subset_by == "views-per-year":
        now = datetime.now()
        for video in videos.values():
            views = int(video["statistics"].get("viewCount", 0))
            published_at = datetime.strptime(
                video["snippet"]["publishedAt"], "%Y-%m-%dT%H:%M:%SZ"
            )
            # +1 so a video published this calendar year isn't divided by 0
            years = now.year - published_at.year
            video["statistics"]["views_per_year"] = views / (years + 1)
        videos = sorted(
            videos.values(),
            key=lambda video: video["statistics"]["views_per_year"],
            reverse=True,
        )
    else:
        # unknown criterion: keep insertion order but still return a list
        # (the original left a dict here, breaking the filters below)
        videos = list(videos.values())

    # cap by number of videos. argparse leaves subset_videos at None when
    # unset; a None slice bound keeps everything, matching prior behavior
    if subset_videos != 0:
        videos = videos[:subset_videos]

    # cap by cumulative size: greedily keep videos in sorted order until the
    # next one would push the total over the budget, then stop
    if subset_gb != 0:
        total_size = 0.0
        kept = []
        for video in videos:
            video_id = video["contentDetails"]["videoId"]
            info = yt_dlp.YoutubeDL(options).extract_info(video_id, download=False)
            # with ignoreerrors, extract_info may return None; the size key
            # may also be absent — treat both as size 0 so the video is kept
            size_bytes = (info or {}).get("filesize_approx") or 0
            video_size = size_bytes / 1024 / 1024 / 1024
            if total_size + video_size > subset_gb:
                break
            total_size += video_size
            kept.append(video)
        videos = kept
    return videos


# Replace some video titles reading 2 text files, one for the video id and one for the title (called with --custom-titles)
def replace_titles(items, custom_titles):
"""replace video titles with custom titles from file"""
Expand All @@ -216,18 +289,13 @@ def replace_titles(items, custom_titles):
with ExitStack() as stack:
files = [stack.enter_context(open(fname)) for fname in custom_titles_files]
for f in files:
# log the number of lines in each file
logger.debug(f"found {len(f.readlines())} custom titles in {f.name}")
# reset the file pointer to the beginning of the file
f.seek(0)
# iterate through the lines in the file
for line in f:
if line.startswith("https://"):
# if the line starts with https://, extract the video id from the url
ids.append(extract.video_id(line))
logger.debug(f"found video id {ids[-1]}")
else:
# otherwise, append the line to the titles list
titles.append(line.rstrip())
logger.debug(f"found title {titles[-1]}")

Expand Down Expand Up @@ -341,10 +409,11 @@ def save_channel_branding(channels_dir, channel_id, save_banner=False):


def skip_deleted_videos(item):
    """filter func to filter-out deleted, unavailable or private videos

    item is a playlistItem JSON dict from the YouTube API. Cached items
    fetched before the "status" part was added to the request lack the
    "status" key, so access it with .get to avoid a KeyError on stale cache.
    """
    return (
        item["snippet"]["title"] != "Deleted video"
        and item["snippet"]["description"] != "This video is unavailable."
        and item.get("status", {}).get("privacyStatus") != "private"
    )


Expand Down