Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 225 additions & 18 deletions internal/parsers/pypi/pypi-parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,114 @@ import (
"github.com/Checkmarx/manifest-parser/pkg/parser/models"
)

// PypiParser implements parsing of requirements.txt and related Python dependency files.
// Supports formats generated by pip freeze, pip-compile, pip-tools, uv export, and Poetry export.
// The type holds no state; parsing is exposed through its Parse method.
type PypiParser struct{}

// logicalLine represents a single dependency entry that may span multiple physical lines
// when line continuations (\) are used. Values are produced by preprocessLines.
type logicalLine struct {
	// content is the text of the joined physical lines with --hash= options
	// removed and surrounding whitespace trimmed. It is empty for blank lines.
	content string
	// firstLine is the 0-indexed position of the first physical line within
	// the slice of lines handed to preprocessLines.
	firstLine int
	// rawFirst is the untrimmed text of the first physical line; it is kept so
	// that column indices can be computed against the original text.
	rawFirst string
}

// pipOptionPrefixes lists prefixes of the pip CLI options most commonly found
// in requirements files. isPipOptionLine no longer needs the list to classify a
// line (any leading "-" marks an option); it is retained as a reference and for
// any code that still ranges over it.
var pipOptionPrefixes = []string{
	"-i ", "--index-url", "--extra-index-url",
	"-r ", "--requirement",
	"-c ", "--constraint",
	"-e ", "--editable",
	"-f ", "--find-links",
	"--no-binary", "--only-binary",
	"--pre", "--trusted-host",
	"--hash=",
}

// isPipOptionLine reports whether the trimmed line is a pip CLI option rather
// than a package spec.
//
// Every option pip accepts in a requirements file starts with "-", whereas a
// requirement starts with a distribution name (which must begin with a letter
// or digit), a URL, or a path. Checking the leading dash therefore covers
// options that a fixed allow-list misses, such as "--no-index",
// "--prefer-binary", "--require-hashes", or short options written without a
// separating space ("-rbase.txt") or with a tab ("-r\tbase.txt"). Those lines
// would otherwise fall through to the package-name matcher and be reported as
// bogus packages.
//
// The caller is expected to pass a line with surrounding whitespace already
// removed; an empty string is not an option line.
func isPipOptionLine(trimmed string) bool {
	return strings.HasPrefix(trimmed, "-")
}

// stripHashOptions drops every whitespace-separated token that begins with
// "--hash=" and returns the surviving tokens joined by single spaces.
// Because the line is tokenized on whitespace, runs of whitespace collapse to
// one space and leading/trailing whitespace disappears.
func stripHashOptions(line string) string {
	fields := strings.Fields(line)
	// Filter in place: kept shares the backing array of fields and never
	// gets ahead of the read position.
	kept := fields[:0]
	for i := range fields {
		if strings.HasPrefix(fields[i], "--hash=") {
			continue
		}
		kept = append(kept, fields[i])
	}
	return strings.Join(kept, " ")
}

// preprocessLines joins physical lines connected by trailing backslashes into
// logical lines and strips --hash= options from the result.
//
// One logicalLine is produced per logical line, including blank ones (their
// content is empty), so that firstLine always refers to the 0-indexed position
// of the first physical line of the entry. rawFirst carries that first line
// untrimmed.
//
// A line whose first non-blank character is '#' is a comment and never starts
// or extends a continuation, even if it ends with a backslash. This mirrors
// pip, which terminates the logical line at a comment line; without it, a
// comment such as "# note \" would swallow the requirement on the next line.
//
// If the input ends while a continuation is still open, the accumulated
// content is emitted as a final logical line. A nil slice is returned when
// there is nothing to emit.
func preprocessLines(lines []string) []logicalLine {
	var result []logicalLine
	var parts []string
	firstLine := -1
	rawFirst := ""

	// flush emits the accumulated parts as one logical line and resets the
	// accumulator state for the next entry.
	flush := func() {
		joined := strings.TrimSpace(stripHashOptions(strings.Join(parts, " ")))
		result = append(result, logicalLine{
			content:   joined,
			firstLine: firstLine,
			rawFirst:  rawFirst,
		})
		parts = nil
		firstLine = -1
		rawFirst = ""
	}

	for i, raw := range lines {
		trimmed := strings.TrimSpace(raw)

		// Remember where the current logical line started.
		if firstLine == -1 {
			firstLine = i
			rawFirst = raw
		}

		// Comment lines are terminal regardless of a trailing backslash.
		continues := strings.HasSuffix(trimmed, "\\") && !strings.HasPrefix(trimmed, "#")
		if continues {
			trimmed = strings.TrimSpace(strings.TrimSuffix(trimmed, "\\"))
		}
		if trimmed != "" {
			parts = append(parts, trimmed)
		}
		if !continues {
			flush()
		}
	}

	// The file ended with an open continuation: emit what was collected.
	if len(parts) > 0 {
		flush()
	}

	return result
}

func extractPackageName(line string, re *regexp.Regexp, lineNum int, manifestFile string) (string, bool) {
if match := re.FindStringSubmatch(line); match != nil {
return match[1], true
Expand All @@ -24,6 +129,13 @@ func extractPackageName(line string, re *regexp.Regexp, lineNum int, manifestFil
func extractVersion(line string) string {
var version string
switch {
case strings.Contains(line, "==="):
parts := strings.SplitN(line, "===", 2)
if len(parts) == 2 {
version = strings.TrimSpace(parts[1])
} else {
version = "latest"
}
case strings.Contains(line, "=="):
parts := strings.SplitN(line, "==", 2)
if len(parts) == 2 {
Expand All @@ -40,6 +152,52 @@ func extractVersion(line string) string {
return version
}

// vcsSchemes holds the version-control prefixes that mark a pip requirement
// as a VCS checkout rather than a named distribution.
var vcsSchemes = []string{"git+", "hg+", "svn+", "bzr+"}

// isVCSRequirement reports whether line begins with one of the vcsSchemes
// prefixes. The comparison is case-sensitive and anchored at the very first
// byte, so callers should pass a line without leading whitespace.
func isVCSRequirement(line string) bool {
	for i := 0; i < len(vcsSchemes); i++ {
		if !strings.HasPrefix(line, vcsSchemes[i]) {
			continue
		}
		return true
	}
	return false
}

// extractVCSPackageName extracts the package name from a VCS requirement line
// using the egg=<name> parameter of the URL fragment. It returns an empty
// string if the line has no fragment or the fragment has no egg parameter.
//
// The fragment is everything after the first '#' up to the first space or tab
// (anything after that, such as an environment marker, is not part of the URL).
// Fragment parameters are separated by '&' and may appear in any order, so both
// "#egg=pkg&subdirectory=src" and "#subdirectory=src&egg=pkg" yield "pkg".
// Extras attached to the egg name ("egg=pkg[extra]") are dropped so the result
// is the bare distribution name, matching extractURLPackageName.
func extractVCSPackageName(line string) string {
	hashIdx := strings.Index(line, "#")
	if hashIdx < 0 {
		return ""
	}
	fragment := line[hashIdx+1:]
	if end := strings.IndexAny(fragment, " \t"); end >= 0 {
		fragment = fragment[:end]
	}

	for _, param := range strings.Split(fragment, "&") {
		if !strings.HasPrefix(param, "egg=") {
			continue
		}
		name := strings.TrimPrefix(param, "egg=")
		// Strip extras like pkg[extra] → pkg
		if bracketIdx := strings.Index(name, "["); bracketIdx >= 0 {
			name = name[:bracketIdx]
		}
		return strings.TrimSpace(name)
	}
	return ""
}

// isURLRequirement reports whether line looks like a PEP 508 direct reference,
// i.e. "pkg @ URL". Detection is purely textual: the line must contain an '@'
// with a single space on each side.
func isURLRequirement(line string) bool {
	const separator = " @ "
	return strings.Contains(line, separator)
}

// extractURLPackageName extracts the package name from a URL requirement
// (pkg @ https://...). It returns an empty string if the line does not contain
// the " @ " separator.
//
// Everything before the first " @ " is taken as the name part. Extras are
// removed (pkg[extra] → pkg) and the result is trimmed afterwards, so
// whitespace between the name and its extras ("pkg [extra] @ ...") does not
// leak into the returned name.
func extractURLPackageName(line string) string {
	sepIdx := strings.Index(line, " @ ")
	if sepIdx < 0 {
		return ""
	}
	name := line[:sepIdx]
	// Strip extras like pkg[extra] → pkg
	if bracketIdx := strings.Index(name, "["); bracketIdx >= 0 {
		name = name[:bracketIdx]
	}
	// Trim last: cutting at '[' can expose whitespace that preceded the extras.
	return strings.TrimSpace(name)
}

func computeIndices(raw, pkgName string) (int, int) {
// Find the start index of the package name
startIdx := strings.Index(raw, pkgName)
Expand Down Expand Up @@ -74,19 +232,55 @@ func (p *PypiParser) Parse(manifestFile string) ([]models.Package, error) {
}
defer file.Close()

var packages []models.Package
// Read all lines into a slice
var lines []string
scanner := bufio.NewScanner(file)
lineNum := 0
for scanner.Scan() {
lines = append(lines, scanner.Text())
}
if err := scanner.Err(); err != nil {
return nil, err
}

// Preprocess: join continuation lines and strip hash options
logicalLines := preprocessLines(lines)

var packages []models.Package
re := regexp.MustCompile(`^([a-zA-Z0-9_\-\.]+)(?:\[.*\])?(?:[>=<!~,\s].*)?$`)

for scanner.Scan() {
raw := scanner.Text()
line := strings.TrimSpace(raw)
for _, ll := range logicalLines {
line := ll.content
if line == "" || strings.HasPrefix(line, "#") {
lineNum++
continue
}
if isPipOptionLine(line) {
continue
}

// Check VCS requirements before stripping # comments,
// since VCS URLs use # as a fragment separator (e.g. #egg=pkg)
if isVCSRequirement(line) {
pkgName := extractVCSPackageName(line)
if pkgName == "" {
log.Printf("Skipping VCS line %d in %s: no #egg= fragment found", ll.firstLine, manifestFile)
continue
}
rawForIndices := strings.TrimRight(ll.rawFirst, " \t\\")
startCol, endCol := computeIndices(rawForIndices, pkgName)
packages = append(packages, models.Package{
PackageManager: "pypi",
PackageName: pkgName,
Version: "latest",
FilePath: manifestFile,
Locations: []models.Location{{
Line: ll.firstLine,
StartIndex: startCol,
EndIndex: endCol,
}},
})
continue
}

if strings.Contains(line, "#") {
line = strings.SplitN(line, "#", 2)[0]
line = strings.TrimSpace(line)
Expand All @@ -96,30 +290,43 @@ func (p *PypiParser) Parse(manifestFile string) ([]models.Package, error) {
line = strings.TrimSpace(line)
}

pkgName, ok := extractPackageName(line, re, lineNum, manifestFile)
if !ok {
lineNum++
continue
var pkgName string
var version string

switch {
case isURLRequirement(line):
// URL requirement: package @ https://...
pkgName = extractURLPackageName(line)
if pkgName == "" {
log.Printf("Skipping URL line %d in %s: no package name found", ll.firstLine, manifestFile)
continue
}
version = "latest"
default:
var ok bool
pkgName, ok = extractPackageName(line, re, ll.firstLine, manifestFile)
if !ok {
continue
}
version = extractVersion(line)
}
version := extractVersion(line)
startCol, endCol := computeIndices(raw, pkgName)

// Strip trailing backslash from the raw first line before computing indices
rawForIndices := strings.TrimRight(ll.rawFirst, " \t\\")
startCol, endCol := computeIndices(rawForIndices, pkgName)

packages = append(packages, models.Package{
PackageManager: "pypi",
PackageName: pkgName,
Version: version,
FilePath: manifestFile,
Locations: []models.Location{{
Line: lineNum,
Line: ll.firstLine,
StartIndex: startCol,
EndIndex: endCol,
}},
})
lineNum++
}

if err := scanner.Err(); err != nil {
return nil, err
}
return packages, nil
}
Loading
Loading