Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions jobspy/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
from jobspy.linkedin.util import (
is_job_remote,
job_type_code,
parse_job_datetime,
parse_job_type,
parse_job_level,
parse_company_industry
parse_company_industry,
)
from jobspy.model import (
JobPost,
Expand Down Expand Up @@ -51,7 +52,10 @@ class LinkedIn(Scraper):
jobs_per_page = 25

def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
self,
proxies: list[str] | str | None = None,
ca_cert: str | None = None,
user_agent: str | None = None,
):
"""
Initializes LinkedInScraper with the LinkedIn job search url
Expand Down Expand Up @@ -213,13 +217,15 @@ def _process_job(
datetime_tag = metadata_card.find(
"time", class_="job-search-card__listdate--new"
)

date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"]
try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except:
date_posted = None
time_posted = None
if datetime_tag:
datetime_posted = parse_job_datetime(datetime_tag)
if datetime_posted:
date_posted = datetime_posted.date()
time_posted = datetime_posted.time()

job_details = {}
if full_descr:
job_details = self._get_job_details(job_id)
Expand All @@ -234,6 +240,7 @@ def _process_job(
location=location,
is_remote=is_remote,
date_posted=date_posted,
time_posted=time_posted,
job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation,
job_type=job_details.get("job_type"),
Expand Down
19 changes: 18 additions & 1 deletion jobspy/linkedin/util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from datetime import datetime

from bs4 import BeautifulSoup

from jobspy.model import JobType, Location
from jobspy.util import get_enum_from_job_type

import dateparser


def job_type_code(job_type_enum: JobType) -> str:
return {
Expand Down Expand Up @@ -85,12 +89,25 @@ def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
return industry


def parse_job_datetime(soup_datetime: BeautifulSoup) -> datetime:
try:
text = soup_datetime.get_text(strip=True)
parsed_datetime = dateparser.parse(text)
if not parsed_datetime:
parsed_datetime = datetime.strptime(
soup_datetime.get("datetime"), "%Y-%m-%d"
)
return parsed_datetime
except:
return None


def is_job_remote(title: dict, description: str, location: Location) -> bool:
    """
    Searches the title, location, and description to check if job is remote

    :param title: job title; interpolated into the searched text as-is
    :param description: job description; interpolated into the searched text as-is
    :param location: job location, rendered through display_location();
        None is treated as an empty location
    :return: True when any remote keyword occurs in the combined text
        (case-insensitive substring match)
    """
    remote_keywords = ("remote", "work from home", "wfh")
    # A listing without a location must not crash the remote check
    location_text = location.display_location() if location is not None else ""
    full_string = f"{title} {description} {location_text}".lower()
    return any(keyword in full_string for keyword in remote_keywords)
25 changes: 17 additions & 8 deletions jobspy/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from abc import ABC, abstractmethod
from typing import Optional
from datetime import date
from datetime import date, time
from enum import Enum
from pydantic import BaseModel

Expand Down Expand Up @@ -236,6 +236,7 @@ class DescriptionFormat(Enum):
HTML = "html"
PLAIN = "plain"


class JobPost(BaseModel):
id: str | None = None
title: str
Expand All @@ -251,6 +252,7 @@ class JobPost(BaseModel):
job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None
time_posted: time | None = None
emails: list[str] | None = None
is_remote: bool | None = None
listing_type: str | None = None
Expand All @@ -273,12 +275,15 @@ class JobPost(BaseModel):
job_function: str | None = None

# Naukri specific
skills: list[str] | None = None #from tagsAndSkills
experience_range: str | None = None #from experienceText
company_rating: float | None = None #from ambitionBoxData.AggregateRating
company_reviews_count: int | None = None #from ambitionBoxData.ReviewsCount
vacancy_count: int | None = None #from vacancy
work_from_home_type: str | None = None #from clusters.wfhType (e.g., "Hybrid", "Remote")
skills: list[str] | None = None # from tagsAndSkills
experience_range: str | None = None # from experienceText
company_rating: float | None = None # from ambitionBoxData.AggregateRating
company_reviews_count: int | None = None # from ambitionBoxData.ReviewsCount
vacancy_count: int | None = None # from vacancy
work_from_home_type: str | None = (
None # from clusters.wfhType (e.g., "Hybrid", "Remote")
)


class JobResponse(BaseModel):
jobs: list[JobPost] = []
Expand Down Expand Up @@ -324,7 +329,11 @@ class ScraperInput(BaseModel):

class Scraper(ABC):
def __init__(
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None, user_agent: str | None = None
self,
site: Site,
proxies: list[str] | None = None,
ca_cert: str | None = None,
user_agent: str | None = None,
):
self.site = site
self.proxies = proxies
Expand Down
7 changes: 5 additions & 2 deletions jobspy/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,15 @@ def markdown_converter(description_html: str):
markdown = md(description_html)
return markdown.strip()

def plain_converter(decription_html: str):
    """
    Reduces an HTML description to plain text

    Returns None when given None. Otherwise returns the document's text
    content with every run of whitespace collapsed to a single space and
    the ends trimmed.
    """
    from bs4 import BeautifulSoup

    if decription_html is None:
        return None

    # Join text nodes with a space so adjacent elements do not run together
    raw_text = BeautifulSoup(decription_html, "html.parser").get_text(separator=" ")
    return re.sub(r"\s+", " ", raw_text).strip()


Expand Down Expand Up @@ -333,6 +335,7 @@ def convert_to_annual(job_data: dict):
"company",
"location",
"date_posted",
"time_posted",
"job_type",
"salary_source",
"interval",
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ pydantic = "^2.3.0"
tls-client = "^1.0.1"
markdownify = "^1.1.0"
regex = "^2024.4.28"
dateparser = "^1.2.2"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
Expand Down