Skip to content

Commit 66ef6c9

Browse files
committed
Fix: Weigh programming language tallies by file size (bytes) instead of file count
Previously, language summaries were calculated based on the number of files, causing languages with many small files (e.g., C header files) to incorrectly appear as the primary language over languages with fewer but larger files (e.g., C++ source). This change uses file size (bytes) as the weight for the tally, ensuring the primary language reflects the code mass rather than file count. Falls back to a weight of 1 if the file size is 0 or missing.
1 parent 022ddc8 commit 66ef6c9

1 file changed

Lines changed: 36 additions & 21 deletions

File tree

src/summarycode/tallies.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -220,35 +220,50 @@ def tally_licenses(license_expressions):
220220
def language_tallies(resource, children, keep_details=False):
221221
"""
222222
Populate a programming_language tallies list of mappings such as
223-
{value: "programming_language", count: "count of occurences"}
224-
sorted by decreasing count.
223+
{value: "programming_language", count: "sum of bytes"}
224+
sorted by decreasing size.
225+
226+
If the file size is 0 or missing, we fall back to a count of 1
227+
to ensure the file is still represented in the tallies.
225228
"""
226229
PROG_LANG = 'programming_language'
227-
languages = []
228-
prog_lang = getattr(resource, PROG_LANG , [])
229-
if not prog_lang:
230-
if resource.is_file:
231-
# also count files with no detection
232-
languages.append(None)
233-
else:
234-
languages.append(prog_lang)
235-
236-
# Collect direct children expression summaries
230+
scores = {}
231+
232+
# 1. Get data for the current file
233+
prog_lang = getattr(resource, PROG_LANG, [])
234+
current_size = getattr(resource, 'size', 0) or 0
235+
236+
# Hybrid Logic: Use size (bytes) if available, otherwise 1 (vote count)
237+
weight = current_size if current_size > 0 else 1
238+
239+
if resource.is_file and prog_lang:
240+
# Handle if prog_lang is a single string or a list
241+
langs = prog_lang if isinstance(prog_lang, list) else [prog_lang]
242+
243+
for lang in langs:
244+
scores[lang] = scores.get(lang, 0) + weight
245+
246+
# 2. Aggregate from children (Bubble up the byte counts)
237247
for child in children:
238248
child_tallies = get_resource_tallies(child, key=PROG_LANG, as_attribute=keep_details) or []
239249
for child_tally in child_tallies:
240-
child_sum_val = child_tally.get('value')
241-
if child_sum_val:
242-
values = [child_sum_val] * child_tally['count']
243-
languages.extend(values)
244-
245-
# summarize proper
246-
languages_counter = tally_languages(languages)
247-
tallied = sorted_counter(languages_counter)
250+
val = child_tally.get('value')
251+
child_count = child_tally.get('count', 0)
252+
253+
if val:
254+
scores[val] = scores.get(val, 0) + child_count
255+
256+
# 3. Format the results
257+
tallied = []
258+
for lang, score in scores.items():
259+
tallied.append({'value': lang, 'count': score})
260+
261+
# Sort by size (Biggest language first)
262+
tallied.sort(key=lambda x: x['count'], reverse=True)
263+
248264
set_resource_tallies(resource, key=PROG_LANG, value=tallied, as_attribute=keep_details)
249265
return tallied
250266

251-
252267
def tally_languages(languages):
253268
"""
254269
Given a list of languages, return a mapping of {language: count

0 commit comments

Comments
 (0)