fix(pypdfium): resolve overlapping text when merging bounding boxes (#1549)

get merged_text from boundingbox instead of merging it to prevent overlaps

Signed-off-by: Pedro Ribeiro <pedro_ribeiro_93@hotmail.com>
This commit is contained in:
Pedro Ribeiro
2025-05-19 14:26:00 +01:00
committed by GitHub
parent 12a0e64892
commit 98b5eeb844
52 changed files with 52225 additions and 4690 deletions

View File

@@ -8589,7 +8589,7 @@
{
"page_no": 1,
"bbox": {
"l": 33.09040069580078,
"l": 33.09052658081055,
"t": 498.9671630859375,
"r": 585.1502075195312,
"b": 89.5469970703125,
@@ -8683,9 +8683,9 @@
{
"page_no": 3,
"bbox": {
"l": 64.1669921875,
"l": 64.16704559326172,
"t": 188.49365234375,
"r": 258.7742919921875,
"r": 258.77435302734375,
"b": 103.87176513671875,
"coord_origin": "BOTTOMLEFT"
},
@@ -8743,7 +8743,7 @@
{
"page_no": 4,
"bbox": {
"l": 145.4144744873047,
"l": 145.41448974609375,
"t": 264.7552490234375,
"r": 252.08840942382812,
"b": 156.616943359375,
@@ -8773,10 +8773,10 @@
{
"page_no": 5,
"bbox": {
"l": 32.075252532958984,
"t": 721.4226226806641,
"l": 32.075260162353516,
"t": 721.4226608276367,
"r": 239.620361328125,
"b": 554.0420684814453,
"b": 554.0421142578125,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
@@ -8996,7 +8996,7 @@
"page_no": 10,
"bbox": {
"l": 135.97177124023438,
"t": 684.5892486572266,
"t": 684.5892562866211,
"r": 545.4180908203125,
"b": 381.39068603515625,
"coord_origin": "BOTTOMLEFT"
@@ -9063,9 +9063,9 @@
{
"page_no": 11,
"bbox": {
"l": 135.64837646484375,
"t": 407.8262939453125,
"r": 301.2367248535156,
"l": 135.64834594726562,
"t": 407.8263244628906,
"r": 301.23675537109375,
"b": 197.24334716796875,
"coord_origin": "BOTTOMLEFT"
},
@@ -9101,10 +9101,10 @@
{
"page_no": 14,
"bbox": {
"l": 63.801902770996094,
"t": 696.6175842285156,
"r": 547.11474609375,
"b": 621.9678497314453,
"l": 63.80195617675781,
"t": 696.6176071166992,
"r": 547.1146850585938,
"b": 621.9679107666016,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
@@ -9139,7 +9139,7 @@
{
"page_no": 14,
"bbox": {
"l": 63.985130310058594,
"l": 63.9850959777832,
"t": 364.09503173828125,
"r": 530.0478515625,
"b": 145.8603515625,
@@ -9178,9 +9178,9 @@
"page_no": 15,
"bbox": {
"l": 136.5016632080078,
"t": 672.7508773803711,
"t": 672.7509078979492,
"r": 545.4508666992188,
"b": 314.4587707519531,
"b": 314.45880126953125,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
@@ -9322,10 +9322,10 @@
{
"page_no": 2,
"bbox": {
"l": 136.1496124267578,
"t": 659.9669647216797,
"r": 547.5267944335938,
"b": 76.34844970703125,
"l": 136.1495819091797,
"t": 659.9669189453125,
"r": 547.52685546875,
"b": 76.3485107421875,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
@@ -12490,9 +12490,9 @@
"page_no": 8,
"bbox": {
"l": 135.52462768554688,
"t": 502.2747802734375,
"t": 502.2746887207031,
"r": 545.8714599609375,
"b": 349.949462890625,
"b": 349.94940185546875,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
@@ -13115,7 +13115,7 @@
"page_no": 9,
"bbox": {
"l": 64.41139221191406,
"t": 398.3863830566406,
"t": 398.3863525390625,
"r": 547.3950805664062,
"b": 70.39208984375,
"coord_origin": "BOTTOMLEFT"
@@ -15731,7 +15731,7 @@
{
"page_no": 12,
"bbox": {
"l": 63.55636978149414,
"l": 63.55635070800781,
"t": 687.7661285400391,
"r": 548.5687255859375,
"b": 495.77532958984375,