Skip to content

Commit 607d2c3

Browse files
Merge pull request #30746 from FazeelUsmani/fix-matplotlib-pdf-scatter
Fix PDF bloat for off-axis scatter with per-point colors
2 parents 9abdac2 + 3ef8b23 commit 607d2c3

File tree

2 files changed

+257
-0
lines changed

2 files changed

+257
-0
lines changed

lib/matplotlib/backends/backend_pdf.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2104,11 +2104,28 @@ def draw_path_collection(self, gc, master_transform, paths, all_transforms,
21042104

21052105
padding = np.max(linewidths)
21062106
path_codes = []
2107+
path_extents = []
21072108
for i, (path, transform) in enumerate(self._iter_collection_raw_paths(
21082109
master_transform, paths, all_transforms)):
21092110
name = self.file.pathCollectionObject(
21102111
gc, path, transform, padding, filled, stroked)
21112112
path_codes.append(name)
2113+
# Compute the extent of each marker path to enable per-marker
2114+
# bounds checking. This allows us to skip markers that are
2115+
# completely outside the visible canvas while preserving markers
2116+
# that are partially visible.
2117+
if len(path.vertices):
2118+
bbox = path.get_extents(transform)
2119+
# Store half-width and half-height for efficient bounds checking
2120+
path_extents.append((bbox.width / 2, bbox.height / 2))
2121+
else:
2122+
path_extents.append((0, 0))
2123+
2124+
# Create a mapping from path_id to extent for efficient lookup
2125+
path_extent_map = dict(zip(path_codes, path_extents))
2126+
2127+
canvas_width = self.file.width * 72
2128+
canvas_height = self.file.height * 72
21122129

21132130
output = self.file.output
21142131
output(*self.gc.push())
@@ -2118,6 +2135,28 @@ def draw_path_collection(self, gc, master_transform, paths, all_transforms,
21182135
facecolors, edgecolors, linewidths, linestyles,
21192136
antialiaseds, urls, offset_position, hatchcolors=hatchcolors):
21202137

2138+
# Optimization: Fast path for markers with centers inside canvas.
2139+
# This avoids the dictionary lookup for the common case where
2140+
# markers are visible, improving performance for large scatter plots.
2141+
if 0 <= xo <= canvas_width and 0 <= yo <= canvas_height:
2142+
# Marker center is inside canvas - definitely render it
2143+
self.check_gc(gc0, rgbFace)
2144+
dx, dy = xo - lastx, yo - lasty
2145+
output(1, 0, 0, 1, dx, dy, Op.concat_matrix, path_id,
2146+
Op.use_xobject)
2147+
lastx, lasty = xo, yo
2148+
continue
2149+
2150+
# Marker center is outside canvas - check if partially visible.
2151+
# Skip markers completely outside visible canvas bounds to reduce
2152+
# PDF file size. Use per-marker extents to handle large markers
2153+
# correctly: only skip if the marker's bounding box doesn't
2154+
# intersect the canvas at all.
2155+
extent_x, extent_y = path_extent_map[path_id]
2156+
if not (-extent_x <= xo <= canvas_width + extent_x
2157+
and -extent_y <= yo <= canvas_height + extent_y):
2158+
continue
2159+
21212160
self.check_gc(gc0, rgbFace)
21222161
dx, dy = xo - lastx, yo - lasty
21232162
output(1, 0, 0, 1, dx, dy, Op.concat_matrix, path_id,

lib/matplotlib/tests/test_backend_pdf.py

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,3 +478,221 @@ def test_font_bitstream_charter():
478478
ax.text(0.1, 0.3, r"fi ffl 1234", usetex=True, fontsize=50)
479479
ax.set_xticks([])
480480
ax.set_yticks([])
481+
482+
483+
def test_scatter_offaxis_colored_pdf_size():
    """
    Test that off-axis scatter plots with per-point colors don't bloat PDFs.

    Regression test for issue #2488. When scatter points with per-point colors
    are completely outside the visible axes, the PDF backend should skip
    writing those markers to significantly reduce file size.

    Three PDFs are written with the same axes limits (20-30 on both axes) and
    their sizes in bytes are compared:

    - off-axis: 1000 colored points in 0-10, entirely outside the view;
    - empty: an empty scatter, used as the size baseline;
    - visible: the same points shifted by +20 so they fall inside the view.
    """
    # Use John Hunter's birthday as random seed for reproducibility
    rng = np.random.default_rng(19680801)

    n_points = 1000
    x = rng.random(n_points) * 10
    y = rng.random(n_points) * 10
    c = rng.random(n_points)

    # Figures are created in the same order as before: off-axis, empty,
    # visible.
    size_offaxis_colored = _scatter_pdf_size(x, y, c)
    size_empty = _scatter_pdf_size([], [])
    size_visible = _scatter_pdf_size(x + 20, y + 20, c)

    # The off-axis colored scatter should be close to empty size.
    # Since the axes are identical, the difference should be minimal
    # (just the scatter collection setup, no actual marker data).
    # Use a tight tolerance since axes output is identical.
    assert size_offaxis_colored < size_empty + 5_000, (
        f"Off-axis colored scatter PDF ({size_offaxis_colored} bytes) is too large. "
        f"Expected close to empty scatter size ({size_empty} bytes). "
        "Markers may not be properly skipped."
    )

    # The visible scatter should be significantly larger than both empty and
    # off-axis, demonstrating the optimization is working.
    assert size_visible > size_empty + 15_000, (
        f"Visible scatter PDF ({size_visible} bytes) should be much larger "
        f"than empty ({size_empty} bytes) to validate the test."
    )
    assert size_visible > size_offaxis_colored + 15_000, (
        f"Visible scatter PDF ({size_visible} bytes) should be much larger "
        f"than off-axis ({size_offaxis_colored} bytes) to validate optimization."
    )


def _scatter_pdf_size(x, y, c=None):
    """
    Return the size in bytes of a PDF holding one scatter viewed at 20-30.

    Parameters
    ----------
    x, y : array-like
        Marker positions in data coordinates.
    c : array-like, optional
        Per-point color values; ``None`` leaves scatter's default coloring.
    """
    fig, ax = plt.subplots()
    try:
        ax.scatter(x, y, c=c)
        ax.set_xlim(20, 30)
        ax.set_ylim(20, 30)

        buf = io.BytesIO()
        fig.savefig(buf, format='pdf')
        # Measure the bytes actually written rather than relying on where
        # the stream position was left.
        return len(buf.getvalue())
    finally:
        # Close the figure even if saving fails so that it does not leak
        # into later tests.
        plt.close(fig)
552+
553+
554+
@check_figures_equal(extensions=["pdf"])
def test_scatter_offaxis_colored_visual(fig_test, fig_ref):
    """
    Check that an on-axis scatter with per-point colors still renders.

    Every point lies in 0-5 while the axes show 0-10, so no marker is
    off-axis here; the off-axis marker skipping must leave normal scatter
    rendering untouched.

    NOTE(review): the test and reference figures are produced by identical
    calls, so the comparison cannot detect a rendering change that affects
    both figures in the same way.
    """
    rng = np.random.default_rng(19680801)

    count = 100
    # The draw order (x, y, colors) fixes the generated values.
    xs = rng.random(count) * 5
    ys = rng.random(count) * 5
    colors = rng.random(count)

    # Build the test figure first, then the reference, with the same calls.
    for fig in (fig_test, fig_ref):
        ax = fig.subplots()
        ax.scatter(xs, ys, c=colors, s=50)
        ax.set_xlim(0, 10)
        ax.set_ylim(0, 10)
580+
581+
582+
@check_figures_equal(extensions=["pdf"])
def test_scatter_mixed_onoff_axis(fig_test, fig_ref):
    """
    Test scatter with some points on-axis and some off-axis.

    Ensures the optimization correctly handles the common case where only
    some markers are outside the visible area: the test figure draws all
    points, the reference figure draws only the on-axis half, and both must
    render identically.
    """
    rng = np.random.default_rng(19680801)

    # Create points: half on-axis (0-5), half off-axis (15-20)
    n_points = 50
    x_on = rng.random(n_points) * 5
    y_on = rng.random(n_points) * 5
    x_off = rng.random(n_points) * 5 + 15
    y_off = rng.random(n_points) * 5 + 15

    x = np.concatenate([x_on, x_off])
    y = np.concatenate([y_on, y_off])
    c = rng.random(2 * n_points)

    # Pin the color limits explicitly. Without vmin/vmax each scatter
    # autoscales its colormap to the values it receives, so the reference
    # (first half of ``c`` only) could map the shared points to different
    # colors than the test figure (all of ``c``).
    color_limits = dict(vmin=0, vmax=1)

    # Test figure: scatter with mixed points
    ax_test = fig_test.subplots()
    ax_test.scatter(x, y, c=c, s=50, **color_limits)
    ax_test.set_xlim(0, 10)
    ax_test.set_ylim(0, 10)

    # Reference figure: only the on-axis points should be visible
    ax_ref = fig_ref.subplots()
    ax_ref.scatter(x_on, y_on, c=c[:n_points], s=50, **color_limits)
    ax_ref.set_xlim(0, 10)
    ax_ref.set_ylim(0, 10)
614+
615+
616+
@check_figures_equal(extensions=["pdf"])
def test_scatter_large_markers_partial_clip(fig_test, fig_ref):
    """
    Check large markers whose centers lie just outside the axes limits.

    Addresses reviewer concern: a marker whose center is outside the visible
    region but whose edge may reach into it must not be dropped.

    NOTE(review): the backend's skip test compares marker centers against
    the page size, not the axes limits. Whether these centers fall outside
    the page depends on the subplot layout -- confirm that this test reaches
    the partial-visibility branch. The test and reference figures are also
    built by identical calls.
    """
    # Axes limits are 0-10; each center sits 0.5 data units beyond one edge.
    xs = np.array([-0.5, 10.5, 5])  # left of, right of, middle of the x-range
    ys = np.array([5, 5, -0.5])     # middle, middle, below the y-range
    colors = np.array([0.2, 0.5, 0.8])

    # s=500 is the marker area in points**2, i.e. a radius of roughly
    # 11 points. Test figure first, then the reference, with the same calls.
    for fig in (fig_test, fig_ref):
        ax = fig.subplots()
        ax.scatter(xs, ys, c=colors, s=500)
        ax.set_xlim(0, 10)
        ax.set_ylim(0, 10)
642+
643+
644+
@check_figures_equal(extensions=["pdf"])
def test_scatter_logscale(fig_test, fig_ref):
    """
    Check the scatter marker skipping on logarithmic axes.

    Points span 1 to 10000 on both axes while the view is limited to
    100-1000, so many markers lie outside the displayed range.

    NOTE(review): the test and reference figures are built by identical
    calls, so this cannot detect an error that affects both figures equally.
    """
    rng = np.random.default_rng(19680801)

    count = 50
    # Exponents are uniform in [0, 4); the draw order (x, y, colors) fixes
    # the generated values.
    x_exponents = rng.random(count) * 4
    xs = 10 ** x_exponents
    y_exponents = rng.random(count) * 4
    ys = 10 ** y_exponents
    colors = rng.random(count)

    # Test figure first, then the reference, with the same calls.
    for fig in (fig_test, fig_ref):
        ax = fig.subplots()
        ax.scatter(xs, ys, c=colors, s=50)
        ax.set_xscale('log')
        ax.set_yscale('log')
        # Only show the middle decade of the data range.
        ax.set_xlim(100, 1000)
        ax.set_ylim(100, 1000)
674+
675+
676+
@check_figures_equal(extensions=["pdf"])
def test_scatter_polar(fig_test, fig_ref):
    """
    Check the scatter marker skipping on a polar projection.

    Radii are drawn from 0-3 while the radial limit is set to 0-2, so some
    markers lie beyond the displayed radial range.

    NOTE(review): the test and reference figures are built by identical
    calls, so this cannot detect an error that affects both figures equally.
    """
    rng = np.random.default_rng(19680801)

    count = 50
    # The draw order (angle, radius, colors) fixes the generated values.
    angles = rng.random(count) * 2 * np.pi
    radii = rng.random(count) * 3
    colors = rng.random(count)

    # Test figure first, then the reference, with the same calls.
    for fig in (fig_test, fig_ref):
        ax = fig.subplots(subplot_kw={'projection': 'polar'})
        ax.scatter(angles, radii, c=colors, s=50)
        ax.set_ylim(0, 2)  # Limit radial range

0 commit comments

Comments
 (0)