807 lines
31 KiB
Python
807 lines
31 KiB
Python
import copy
|
|
import logging
|
|
|
|
import networkx as nx
|
|
|
|
logger = logging.getLogger("layout_utils")
|
|
|
|
|
|
## -------------------------------
|
|
## Geometric helper functions
|
|
## The coordinates grow left to right, and bottom to top.
|
|
## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
|
|
|
|
|
|
def area(bbox):
    """Return the area of a box given as [x_left, y_bottom, x_right, y_top]."""
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
|
|
|
|
|
|
def contains(bbox_i, bbox_j):
    """Return True if bbox_i encloses bbox_j (shared edges count), else False."""
    spans_horizontally = bbox_i[0] <= bbox_j[0] and bbox_i[2] >= bbox_j[2]
    spans_vertically = bbox_i[1] <= bbox_j[1] and bbox_i[3] >= bbox_j[3]
    return spans_horizontally and spans_vertically
|
|
|
|
|
|
def is_intersecting(bbox_i, bbox_j):
    """Return True if the two boxes overlap or touch, else False."""
    apart_horizontally = bbox_i[2] < bbox_j[0] or bbox_i[0] > bbox_j[2]
    apart_vertically = bbox_i[3] < bbox_j[1] or bbox_i[1] > bbox_j[3]
    return not (apart_horizontally or apart_vertically)
|
|
|
|
|
|
def bb_iou(boxA, boxB):
    """Return the intersection-over-union of two boxes.

    Widths and heights are computed as ``max - min + 1`` (inclusive
    coordinates), which differs from ``area`` in this module.
    """
    # Corners of the overlap rectangle.
    left = max(boxA[0], boxB[0])
    bottom = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    top = min(boxA[3], boxB[3])

    # Clamp each side at zero so disjoint boxes give an empty overlap.
    overlap_width = max(0, right - left + 1)
    overlap_height = max(0, top - bottom + 1)
    intersection = overlap_width * overlap_height

    area_a = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_b = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # Union = sum of both areas minus the part counted twice.
    union = float(area_a + area_b - intersection)
    return intersection / union
|
|
|
|
|
|
def compute_intersection(bbox_i, bbox_j):
    """Return the area of the overlap of the two boxes (0 if they do not intersect)."""
    if not is_intersecting(bbox_i, bbox_j):
        return 0

    # Extent of the overlap rectangle along each axis.
    overlap_width = min(bbox_i[2], bbox_j[2]) - max(bbox_i[0], bbox_j[0])
    overlap_height = min(bbox_i[3], bbox_j[3]) - max(bbox_i[1], bbox_j[1])
    overlap_area = overlap_width * overlap_height

    if overlap_area < 0:
        logger.debug("Warning: Negative intersection detected!")
        return 0
    return overlap_area
|
|
|
|
|
|
def surrounding(bbox_i, bbox_j):
    """Return the minimal box [x_left, y_bottom, x_right, y_top] containing both boxes."""
    return [
        min(bbox_i[0], bbox_j[0]),
        min(bbox_i[1], bbox_j[1]),
        max(bbox_i[2], bbox_j[2]),
        max(bbox_i[3], bbox_j[3]),
    ]
|
|
|
|
|
|
def surrounding_list(bbox_list):
    """Return the minimal box containing every box in bbox_list.

    An empty list yields [0, 0, 0, 0].
    """
    if len(bbox_list) == 0:
        return [0, 0, 0, 0]

    x_left = min(box[0] for box in bbox_list)
    y_bottom = min(box[1] for box in bbox_list)
    x_right = max(box[2] for box in bbox_list)
    y_top = max(box[3] for box in bbox_list)
    return [x_left, y_bottom, x_right, y_top]
|
|
|
|
|
|
def vertical_overlap(bboxA, bboxB):
    """Return True if the vertical extents of the two boxes overlap or touch.

    bbox[1] is the lower bound, bbox[3] the upper bound (larger number).
    """
    b_below_a = bboxB[3] < bboxA[1]
    a_below_b = bboxA[3] < bboxB[1]
    return not (b_below_a or a_below_b)
|
|
|
|
|
|
def vertical_overlap_fraction(bboxA, bboxB):
    """Return the vertical overlap as a fraction of the smaller box height.

    bbox[1] is the lower bound, bbox[3] the upper bound (larger number).
    Height 0 is permitted: when one box lies vertically inside the other the
    result is 1, otherwise the divisor is floored at 0.001.
    """
    min_height = min(bboxA[3] - bboxA[1], bboxB[3] - bboxB[1])

    # Name the box whose top edge is higher (A wins ties).
    if bboxA[3] >= bboxB[3]:
        upper, lower = bboxA, bboxB
    else:
        upper, lower = bboxB, bboxA

    # The lower-topped box is completely inside the vertical span of the other.
    if upper[1] <= lower[1]:
        return 1

    overlap = max(lower[3] - upper[1], 0)
    return overlap / max(min_height, 0.001)
|
|
|
|
|
|
## -------------------------------
|
|
## Cluster-and-cell relations
|
|
|
|
|
|
def compute_enclosed_cells(
    cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
):
    """Find the raw cells that belong to a cluster bounding box.

    A cell is selected when either
      * its overlap with ``cluster_bbox`` exceeds
        ``min_cell_intersection_with_cluster`` times the cell's own area, or
      * the cell, shrunk by 3 units on every side, lies inside ``cluster_bbox``.

    Args:
        cluster_bbox: Box as [x_left, y_bottom, x_right, y_top].
        raw_cells: Sequence of dicts, each with a "bbox" entry.
        min_cell_intersection_with_cluster: Minimum overlap as a fraction of
            the cell area.

    Returns:
        Tuple ``(cells_in_cluster, cells_in_cluster_int)``: indices of the
        selected cells and, index-aligned with them, their overlap areas.
    """
    # Tolerance used when testing whether a cell sits inside the cluster box.
    shrink = 3

    cells_in_cluster = []
    cells_in_cluster_int = []
    for ix, cell in enumerate(raw_cells):
        cell_bbox = cell["bbox"]
        intersection = compute_intersection(cell_bbox, cluster_bbox)
        frac_area = area(cell_bbox) * min_cell_intersection_with_cluster

        # Overlap covers more than the required fraction of the cell.
        selected = intersection > frac_area and frac_area > 0
        if not selected:
            shrunk_bbox = [
                cell_bbox[0] + shrink,
                cell_bbox[1] + shrink,
                cell_bbox[2] - shrink,
                cell_bbox[3] - shrink,
            ]
            selected = contains(cluster_bbox, shrunk_bbox)

        if selected:
            cells_in_cluster.append(ix)
            # Record the overlap for every selected cell so both returned
            # lists stay index-aligned (the containment case used to be
            # missing from the second list).
            cells_in_cluster_int.append(intersection)
    return cells_in_cluster, cells_in_cluster_int
|
|
|
|
|
|
def find_clusters_around_cells(cell_count, clusters):
    """Return, per raw cell, the indices of the clusters that list it.

    The result has ``cell_count`` entries in raw-cell order; entry ``i`` holds
    the positions in ``clusters`` whose "cell_ids" contain ``i``.
    """
    memberships = []
    for _ in range(cell_count):
        memberships.append([])

    for cluster_pos, cluster in enumerate(clusters):
        for cell_id in cluster["cell_ids"]:
            memberships[cell_id].append(cluster_pos)
    return memberships
|
|
|
|
|
|
def find_cell_index(raw_ix, cell_array):
    """Return the index of the first cell whose "rawcell_id" equals raw_ix.

    "cell_array" has the structure of an (annotation) cells array.
    Returns None when no cell matches.
    """
    matching_positions = (
        position
        for position, cell in enumerate(cell_array)
        if cell["rawcell_id"] == raw_ix
    )
    return next(matching_positions, None)
|
|
|
|
|
|
def find_cell_indices(cluster, cell_array):
    """Return positions in cell_array of the cells referenced by the cluster.

    "cluster" needs a "cell_ids" entry and every cell a "rawcell_id" entry.
    Positions are listed in ascending order of the rawcell ids; ids with no
    matching cell are skipped.
    """
    return [
        position
        for raw_ix in sorted(cluster["cell_ids"])
        for position, cell in enumerate(cell_array)
        if cell["rawcell_id"] == raw_ix
    ]
|
|
|
|
|
|
def find_first_cell_index(cluster, cell_array):
    """Return the position of the cell with the cluster's lowest rawcell id.

    "cluster" must be a dict with key "cell_ids" (it can also be a line).
    The result is a list with one index, or an empty list when the cluster has
    no cells or the cell is not present in cell_array.
    """
    cell_ids = cluster["cell_ids"]
    if len(cell_ids) == 0:
        # E.g. a picture without text cells.
        return []

    raw_ix = min(cell_ids)
    for position, cell in enumerate(cell_array):
        if cell["rawcell_id"] == raw_ix:
            # The first hit is enough; there should be only one anyway.
            return [position]

    logger.debug(
        " Warning: Raw cell " + str(raw_ix) + " not found in annotation cells"
    )
    return []
|
|
|
|
|
|
## -------------------------------
|
|
## Cluster labels and text
|
|
|
|
|
|
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
    """Set new_label on a cluster and on its cells inside target_pred.

    "cluster" must have a "cell_ids" entry, "cl_ix" is its index in
    target_pred["clusters"], and "target_pred" is the whole prediction.
    The cluster's "type" and the "label" of each of its cells in
    target_pred["cells"] are overwritten in place.
    Returns new_label so the caller can update its own variable easily.
    """
    target_pred["clusters"][cl_ix]["type"] = new_label

    target_cells = target_pred["cells"]
    for position in find_cell_indices(cluster, target_cells):
        target_cells[position]["label"] = new_label
    return new_label
|
|
|
|
|
|
def find_cluster_text(cluster, raw_cells):
    """Return the cluster text with single blanks between cell contents.

    "cluster" must be a dict with "cell_ids" (it can also be a line);
    "raw_cells" is indexed by those ids and each entry has a "text" entry.
    Cells are joined in ascending id order and trailing whitespace is stripped.
    Note that in formulas this may give more blanks than in the original.
    """
    pieces = [raw_cells[raw_ix]["text"] for raw_ix in sorted(cluster["cell_ids"])]
    return " ".join(pieces).rstrip()
|
|
|
|
|
|
def find_cluster_text_without_blanks(cluster, raw_cells):
    """Return the cluster text with the cell contents concatenated directly.

    "cluster" must be a dict with "cell_ids" (it can also be a line);
    "raw_cells" is indexed by those ids and each entry has a "text" entry.
    Cells are taken in ascending id order and trailing whitespace is stripped.
    Interesting in formula analysis.
    """
    pieces = [raw_cells[raw_ix]["text"] for raw_ix in sorted(cluster["cell_ids"])]
    return "".join(pieces).rstrip()
|
|
|
|
|
|
## -------------------------------
|
|
## Clusters and lines
|
|
## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
|
|
## but this one also in FormulaAnalysis)
|
|
|
|
|
|
def build_cluster_from_lines(lines, label, id):
    """Merge a list of lines into one cluster in standard format.

    Args:
        lines: Non-empty list of dicts with "cell_ids" and "bbox" and,
            optionally, "confidence" (treated as 0 when absent). There is no
            requirement that they are geometrically lines.
        label: Value stored under "type" in the new cluster.
        id: Value stored under "id" in the new cluster.

    Returns:
        A new cluster dict whose "cell_ids" is the concatenation of all line
        cell ids, whose "bbox" surrounds all line boxes, and whose
        "confidence" is the mean confidence over all lines. The input lines
        are not modified and the result shares no list with them.

    Raises:
        IndexError: If ``lines`` is empty.
    """
    first_line = lines[0]
    # Copy so that the returned cluster never aliases the caller's data.
    cell_ids = list(first_line["cell_ids"])
    bbox = copy.deepcopy(first_line["bbox"])
    confidence_sum = first_line.get("confidence", 0)

    for line in lines[1:]:
        cell_ids.extend(line["cell_ids"])
        bbox = surrounding(bbox, line["bbox"])
        confidence_sum += line.get("confidence", 0)

    # Average over every line, including the first one; dividing by the
    # number of lines also makes the single-line case well defined.
    return {
        "id": id,
        "type": label,
        "cell_ids": cell_ids,
        "bbox": bbox,
        "confidence": confidence_sum / len(lines),
        "created_by": "merged_cells",
    }
|
|
|
|
|
|
## -------------------------------
|
|
## Reading order
|
|
|
|
|
|
def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
    """Sort the cells inside each cluster and the clusters themselves.

    Args:
        clusters: List of cluster dicts with "id" and "cell_ids".
        cluster_sort_type: Currently only "raw_cell_ids" is supported: clusters
            with cells are ordered by their first cell id, clusters without
            cells are placed after them. Any other value leaves the cluster
            order unchanged.
        cell_sort_type: Currently only "raw_cell_ids" is supported: each
            cluster's "cell_ids" is replaced by its sorted version. Any other
            value leaves the cell ids unchanged.
        sort_ids: Boolean, whether the cluster ids should be rewritten to
            match their new position.

    Returns:
        A new list holding the same cluster dicts in reading order. The
        cluster dicts themselves are modified in place.
    """
    logger.debug("---- Start cluster sorting ------")

    if cell_sort_type == "raw_cell_ids":
        for cl in clusters:
            cl["cell_ids"] = sorted(cl["cell_ids"])
    else:
        # str() keeps the log call from failing on non-string values.
        logger.debug(
            "Unknown cell_sort_type `"
            + str(cell_sort_type)
            + "`, no cell sorting will happen."
        )

    if cluster_sort_type == "raw_cell_ids":
        clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
        clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
        logger.debug(
            "Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
        )
        logger.debug(
            " Their first cell ids: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells])
        )
        logger.debug(
            "Clusters without cells: "
            + str([cl["id"] for cl in clusters_without_cells])
        )
        clusters_with_cells_sorted = sorted(
            clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
        )
        logger.debug(
            " First cell ids after sorting: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
        )
        sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
    else:
        logger.debug(
            "Unknown cluster_sort_type: `"
            + str(cluster_sort_type)
            + "`, no cluster sorting will happen."
        )
        # Keep the input order; without this the name below would be unbound.
        sorted_clusters = list(clusters)

    if sort_ids:
        for i, cl in enumerate(sorted_clusters):
            cl["id"] = i
    return sorted_clusters
|
|
|
|
|
|
## -------------------------------
|
|
## Line Splitting
|
|
|
|
|
|
def sort_cells_horizontal(line_cell_ids, raw_cells):
    """Return the cell ids of a line ordered by the left edge (x0) of each cell.

    "line_cell_ids" is a list of raw cell ids and "raw_cells" is indexed by
    those ids, each entry having a "bbox". The input list is not modified.
    """

    def left_edge(cell_id):
        return raw_cells[cell_id]["bbox"][0]

    ordered_ids = list(line_cell_ids)
    ordered_ids.sort(key=left_edge)
    return ordered_ids
|
|
|
|
|
|
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
    """Return copies of the clusters with their bounding boxes re-fitted.

    Every cluster is deep-copied, so the input clusters are left untouched.
    Clusters without cells are dropped unless their type is "Picture".
    The new "bbox" of each kept cluster comes from adapt_bbox.
    """
    adapted = []
    for position, original in enumerate(clusters):
        candidate = copy.deepcopy(original)
        logger.debug(
            "Treating cluster " + str(position) + ", type " + str(candidate["type"])
        )
        logger.debug(" with cells: " + str(candidate["cell_ids"]))

        has_no_cells = len(original["cell_ids"]) == 0
        if has_no_cells and original["type"] != "Picture":
            # A former cluster that is now without cells: leave it out.
            logger.debug(" Empty non-picture, removed")
            continue

        candidate["bbox"] = adapt_bbox(raw_cells, candidate, orphan_cell_indices)
        adapted.append(candidate)
    return adapted
|
|
|
|
|
|
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
    """Compute an adapted bounding box for one cluster.

    "raw_cells" is indexed by the ids in cluster["cell_ids"] and each entry has
    a "bbox"; "cluster" needs "type", "bbox" and "cell_ids";
    "orphan_cell_indices" are cell indices that are not treated as spurious.

    For a "Picture" the result is the cluster bbox grown to include its cells
    (or the unchanged cluster bbox object when it has no cells). For all other
    types the bbox is grown to include the cluster's cells and then cut back
    to exclude enclosed cells that are neither in the cluster nor orphans.
    Returns the new bbox as [x_left, y_bottom, x_right, y_top].
    """
    if not (cluster["type"] in ["Table", "Picture"]):
        ## A text-like cluster. The bbox only needs to be around the text cells:
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" New bounding box:" + str(new_bbox))
    ## NOTE(review): the statement below is a plain `if`, not `elif`, so a
    ## text-like cluster also runs the `else` ("A table") branch and the bbox
    ## computed above is overwritten. Confirm whether `elif` was intended.
    if cluster["type"] == "Picture":
        ## We only make the bbox completely comprise included text cells:
        logger.debug(" Picture")
        if len(cluster["cell_ids"]) != 0:
            min_bbox = surrounding_list(
                [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
            )
            logger.debug(" Minimum bbox: " + str(min_bbox))
            logger.debug(" Initial bbox: " + str(cluster["bbox"]))
            new_bbox = surrounding(min_bbox, cluster["bbox"])
            logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
        else:
            logger.debug(" without text cells, no change.")
            new_bbox = cluster["bbox"]
    else: ## A table
        ## At least we have to keep the included text cells, and we make the bbox completely comprise them
        min_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" Minimum bbox: " + str(min_bbox))
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding(min_bbox, cluster["bbox"])
        logger.debug(" Possibly increased bbox: " + str(new_bbox))

        ## Now we look which non-belonging cells are covered.
        ## (To decrease dependencies, we don't make use of which cells we actually removed.)
        ## We don't worry about orphan cells, those could still be added to the table.
        enclosed_cells = compute_enclosed_cells(
            new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
        )[0]
        additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
        logger.debug(
            " Additional cells enclosed by Table bbox: " + str(additional_cells)
        )
        spurious_cells = additional_cells - set(orphan_cell_indices)
        logger.debug(
            " Spurious cells enclosed by Table bbox (additional minus orphans): "
            + str(spurious_cells)
        )
        if len(spurious_cells) == 0:
            return new_bbox

        ## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
        ## We initialize possible cuts with the current bbox.
        left_cut = new_bbox[0]
        right_cut = new_bbox[2]
        upper_cut = new_bbox[3]
        lower_cut = new_bbox[1]

        for cell_ix in spurious_cells:
            cell = raw_cells[cell_ix]
            # logger.debug(" Spurious cell bbox: " + str(cell["bbox"]))
            ## A spurious cell only moves a cut when it lies completely on
            ## one side of min_bbox (the box around the cluster's own cells).
            is_left = cell["bbox"][2] < min_bbox[0]
            is_right = cell["bbox"][0] > min_bbox[2]
            is_above = cell["bbox"][1] > min_bbox[3]
            is_below = cell["bbox"][3] < min_bbox[1]
            # logger.debug(" Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))

            if is_left:
                if cell["bbox"][2] > left_cut:
                    ## We move the left cut to exclude this cell:
                    left_cut = cell["bbox"][2]
            if is_right:
                if cell["bbox"][0] < right_cut:
                    ## We move the right cut to exclude this cell:
                    right_cut = cell["bbox"][0]
            if is_above:
                if cell["bbox"][1] < upper_cut:
                    ## We move the upper cut to exclude this cell:
                    upper_cut = cell["bbox"][1]
            if is_below:
                if cell["bbox"][3] > lower_cut:
                    ## We move the lower cut to exclude this cell:
                    lower_cut = cell["bbox"][3]
            # logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))

        new_bbox = [left_cut, lower_cut, right_cut, upper_cut]

    logger.debug(" Final bbox: " + str(new_bbox))
    return new_bbox
|
|
|
|
|
|
def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
    """Drop clusters that duplicate a more confident cluster.

    A cluster is removed when another cluster with a different id and a
    strictly higher confidence either has an IoU above ``threshold`` with it
    or contains it (after shrinking it by 3 units on every side).

    Args:
        cluster_predictions: List of cluster dicts with "id", "confidence"
            and "bbox". The list is modified in place.
        threshold: IoU above which two clusters count as duplicates.

    Returns:
        The same list object, without the duplicate clusters.
    """
    shrink = 3
    duplicate_ids = set()

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] == cluster_2["id"]:
                continue
            # Only the less confident cluster of a pair can be dropped.
            if not cluster_1["confidence"] > cluster_2["confidence"]:
                continue

            box_2 = cluster_2["bbox"]
            shrunk_box_2 = [
                box_2[0] + shrink,
                box_2[1] + shrink,
                box_2[2] - shrink,
                box_2[3] - shrink,
            ]
            if bb_iou(cluster_1["bbox"], box_2) > threshold or contains(
                cluster_1["bbox"], shrunk_box_2
            ):
                duplicate_ids.add(cluster_2["id"])

    if duplicate_ids:
        # Rebuild in place instead of calling remove() while iterating, which
        # skips the element following each removed one.
        cluster_predictions[:] = [
            cluster
            for cluster in cluster_predictions
            if cluster["id"] not in duplicate_ids
        ]
    return cluster_predictions
|
|
|
|
|
|
# Assign orphan cells by a low confidence prediction that is below the assigned confidence
|
|
def assign_orphans_with_low_conf_pred(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    """Attach orphan cells to the best matching low-confidence cluster.

    For every orphan cell the low-confidence cluster is chosen that improves
    on both the best IoU and the best confidence seen so far (each starting
    at 0.05) and whose area is less than three times the cell area. The
    chosen cluster gets the cell id appended, is tagged with
    ``created_by = "orph_low_conf"`` and is added to ``cluster_predictions``.

    Args:
        cluster_predictions: List of accepted clusters; extended in place.
        cluster_predictions_low: Candidate clusters with "bbox", "confidence"
            and "cell_ids"; chosen candidates are modified in place.
        raw_cells: Sequence indexed by cell id, each entry having a "bbox".
        orphan_cell_indices: Cell ids without a cluster; assigned ids are
            removed in place.

    Returns:
        Tuple ``(cluster_predictions, orphan_cell_indices)`` (the same list
        objects that were passed in).
    """
    min_iou = 0.05
    min_confidence = 0.05

    # Iterate over a snapshot: ids are removed from the list inside the loop,
    # and removing while iterating would skip the following orphan.
    for orph_id in list(orphan_cell_indices):
        cell_bbox = raw_cells[orph_id]["bbox"]
        cell_area = (cell_bbox[3] - cell_bbox[1]) * (cell_bbox[2] - cell_bbox[0])

        cluster_chosen = None
        best_iou = min_iou
        best_confidence = min_confidence

        # Find the candidate with the highest IoU and confidence.
        for cluster in cluster_predictions_low:
            cluster_bbox = cluster["bbox"]
            calc_iou = bb_iou(cluster_bbox, cell_bbox)
            cluster_area = (cluster_bbox[3] - cluster_bbox[1]) * (
                cluster_bbox[2] - cluster_bbox[0]
            )
            if (
                calc_iou > best_iou
                and cluster["confidence"] > best_confidence
                and cell_area * 3 > cluster_area
            ):
                cluster_chosen = cluster
                best_iou = calc_iou
                best_confidence = cluster["confidence"]

        if cluster_chosen is None:
            continue

        # Assign the cell and tag the cluster's origin for tracking.
        cluster_chosen["cell_ids"].append(orph_id)
        cluster_chosen["created_by"] = "orph_low_conf"
        # The same candidate can win for several orphans; add it only once.
        if not any(existing is cluster_chosen for existing in cluster_predictions):
            cluster_predictions.append(cluster_chosen)
        orphan_cell_indices.remove(orph_id)

    return cluster_predictions, orphan_cell_indices
|
|
|
|
|
|
def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
    """Resolve cells that are assigned to more than one cluster.

    For every ambiguous cell the clusters containing it are scanned and the
    one that improves on both the best confidence and the best IoU with the
    cell (each starting at 0) is kept as owner. The cell id is removed from
    every other cluster that lists it; if no cluster qualifies as owner, it
    is removed from all of them.

    Args:
        cluster_predictions: List of cluster dicts with "cell_ids", "bbox"
            and "confidence"; their "cell_ids" lists are edited in place.
        raw_cells: Sequence indexed by cell id, each entry having a "bbox".
        amb_cell_idxs: Ids of the ambiguous cells; emptied in place as the
            cells are processed.

    Returns:
        Tuple ``(cluster_predictions, amb_cell_idxs)`` (the same objects that
        were passed in).
    """
    # Iterate over a snapshot: the list is emptied inside the loop, and
    # removing while iterating would skip every other ambiguous cell.
    for amb_cell_id in list(amb_cell_idxs):
        highest_conf = 0
        highest_bbox_iou = 0
        cluster_chosen = None
        holding_clusters = []

        # Find the clusters in question and the best owner among them.
        for cluster in cluster_predictions:
            if amb_cell_id not in cluster["cell_ids"]:
                continue
            holding_clusters.append(cluster)

            bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])
            if (
                cluster["confidence"] > highest_conf
                and bbox_iou_val > highest_bbox_iou
            ):
                cluster_chosen = cluster
                highest_conf = cluster["confidence"]
                highest_bbox_iou = bbox_iou_val

        # Remove the cell from every holder except the chosen owner. Clusters
        # are compared by identity; the earlier code mixed up cell ids and
        # cluster ids and therefore also stripped the cell from the owner.
        for cluster in holding_clusters:
            if cluster is cluster_chosen:
                continue
            cluster["cell_ids"][:] = [
                cell_id for cell_id in cluster["cell_ids"] if cell_id != amb_cell_id
            ]

        amb_cell_idxs.remove(amb_cell_id)

    return cluster_predictions, amb_cell_idxs
|
|
|
|
|
|
def ranges(nums):
    """Collapse numbers into a list of (start, end) runs of consecutive values.

    Duplicates are ignored and the values are processed in ascending order.
    A new run starts whenever the previous value plus one is smaller than the
    next value. Used to detect line numbers in review manuscripts.
    """
    runs = []
    for value in sorted(set(nums)):
        if runs and not (runs[-1][1] + 1 < value):
            # No gap to the current run: extend its end.
            runs[-1][1] = value
        else:
            runs.append([value, value])
    return [tuple(run) for run in runs]
|
|
|
|
|
|
def set_orphan_as_text(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    """Create a one-cell cluster for every orphan cell that carries text.

    "cluster_predictions" is extended in place with clusters tagged
    created_by="orphan_default" and confidence -1; their ids continue after
    the highest existing cluster id. Orphans are skipped when their text is
    empty or whitespace, or when they lie inside a "Picture" cluster. No
    cluster at all is created when the orphans look like line numbers (see
    below). The cluster type defaults to "Text" and may be taken from
    "cluster_predictions_low".

    Returns the tuple (cluster_predictions, orphan_cell_indices);
    orphan_cell_indices is returned unchanged.
    """
    max_id = -1
    figures = []
    for cluster in cluster_predictions:
        if cluster["type"] == "Picture":
            figures.append(cluster)

        if cluster["id"] > max_id:
            max_id = cluster["id"]
    # First free id for the clusters created below.
    max_id += 1

    lines_detector = False
    content_of_orphans = []
    for orph_id in orphan_cell_indices:
        # NOTE(review): orph_cell is assigned here but not used in this loop.
        orph_cell = raw_cells[orph_id]
        content_of_orphans.append(raw_cells[orph_id]["text"])

    # Keep only the orphan texts that are numbers, as ints.
    fil_content_of_orphans = []
    for cell_content in content_of_orphans:
        if cell_content.isnumeric():
            try:
                num = int(cell_content)
                fil_content_of_orphans.append(num)
            except ValueError:  # ignore the cell
                pass

    # line_orphans = []
    # Check if there are more than 2 numeric pdf orphan cells; if so,
    # check whether they form runs of consecutive numbers (using the
    # ranges function) to decide whether the page carries line numbers.

    if len(fil_content_of_orphans) > 2:
        out_ranges = ranges(fil_content_of_orphans)
        if len(out_ranges) > 1:
            # NOTE(review): cnt_range is assigned but never read.
            cnt_range = 0
            for ranges_ in out_ranges:
                if ranges_[0] != ranges_[1]:
                    # If a run spans more than 75 numbers (half the total line number of a review manuscript page)
                    # decide that there are line numbers on page to be ignored.
                    if len(list(range(ranges_[0], ranges_[1]))) > 75:
                        lines_detector = True
                        # line_orphans = line_orphans + list(range(ranges_[0], ranges_[1]))

    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        # Only cells with non-blank text are considered.
        if bool(orph_cell["text"] and not orph_cell["text"].isspace()):
            fig_flag = False
            # Do not assign orphan cells if they are inside a figure
            for fig in figures:
                if contains(fig["bbox"], orph_cell["bbox"]):
                    fig_flag = True

            # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
            if fig_flag == False and lines_detector == False:
                # get class from low confidence detections if not set as text:
                class_type = "Text"

                for cluster in cluster_predictions_low:
                    intersection = compute_intersection(
                        orph_cell["bbox"], cluster["bbox"]
                    )
                    # NOTE(review): class_type is reset on every iteration, so
                    # only the last low-confidence cluster decides the type.
                    # Confirm whether that is intended.
                    class_type = "Text"
                    if (
                        cluster["confidence"] > 0.1
                        and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
                    ):
                        class_type = cluster["type"]
                    elif contains(
                        cluster["bbox"],
                        [
                            orph_cell["bbox"][0] + 3,
                            orph_cell["bbox"][1] + 3,
                            orph_cell["bbox"][2] - 3,
                            orph_cell["bbox"][3] - 3,
                        ],
                    ):
                        class_type = cluster["type"]
                    elif intersection > area(orph_cell["bbox"]) * 0.2:
                        class_type = cluster["type"]

                # The new cluster reuses the cell's bbox object as its own bbox.
                new_cluster = {
                    "id": max_id,
                    "bbox": orph_cell["bbox"],
                    "type": class_type,
                    "cell_ids": [orph_id],
                    "confidence": -1,
                    "created_by": "orphan_default",
                }
                max_id += 1
                cluster_predictions.append(new_cluster)
    return cluster_predictions, orphan_cell_indices
|
|
|
|
|
|
def merge_cells(cluster_predictions):
    """Merge touching or nearly touching orphan clusters into "Text" clusters.

    Clusters with ``created_by == "orphan_default"`` become graph nodes; two of
    them are linked when their boxes, each grown by 2 units on every side,
    intersect. Every connected group of two or more clusters is removed from
    ``cluster_predictions`` and replaced by one cluster built with
    ``build_cluster_from_lines``, labelled "Text" and given a fresh id above
    the highest id present on entry.

    Args:
        cluster_predictions: List of cluster dicts with "id", "bbox",
            "cell_ids" and "created_by". The list is modified in place.

    Returns:
        The same list object after merging.
    """
    margin = 2

    def grown(bbox):
        # Work on a fresh list so the cluster's own bbox is never modified.
        return [bbox[0] - margin, bbox[1] - margin, bbox[2] + margin, bbox[3] + margin]

    orphans = [
        cluster
        for cluster in cluster_predictions
        if cluster["created_by"] == "orphan_default"
    ]
    grown_boxes = [grown(cluster["bbox"]) for cluster in orphans]

    G = nx.Graph()
    G.add_nodes_from(cluster["id"] for cluster in orphans)
    # The intersection test is symmetric, so each unordered pair is checked once.
    for first_pos, first in enumerate(orphans):
        for second_pos in range(first_pos + 1, len(orphans)):
            second = orphans[second_pos]
            if first["id"] != second["id"] and is_intersecting(
                grown_boxes[first_pos], grown_boxes[second_pos]
            ):
                G.add_edge(first["id"], second["id"])

    component = sorted(map(sorted, nx.k_edge_components(G, k=1)))

    max_id = max([-1] + [cluster["id"] for cluster in cluster_predictions])

    for nodes in component:
        if len(nodes) < 2:
            continue
        max_id += 1

        members = set(nodes)
        by_id = {}
        for cluster in cluster_predictions:
            if cluster["id"] in members:
                by_id.setdefault(cluster["id"], cluster)
        lines = [by_id[node] for node in nodes if node in by_id]

        # Filter in place rather than calling remove() while iterating over
        # the same list, which skips the element after each removal.
        cluster_predictions[:] = [
            cluster for cluster in cluster_predictions if cluster["id"] not in members
        ]
        cluster_predictions.append(build_cluster_from_lines(lines, "Text", max_id))

    return cluster_predictions
|
|
|
|
|
|
def clean_up_clusters(
    cluster_predictions,
    raw_cells,
    merge_cells=False,
    img_table=False,
    one_cell_table=False,
):
    """Remove redundant clusters according to the enabled clean-up rules.

    Every ordered pair of clusters with different ids is examined:

    * ``merge_cells``: if cluster_1 contains cluster_2 (shrunk by 3 units on
      every side), cluster_2's cell ids are appended to cluster_1 and
      cluster_2 is removed.
    * ``img_table`` (only evaluated when ``merge_cells`` is off): a "Text"
      cluster is removed when a "Picture" or "Table" cluster has an IoU above
      0.5 with it or contains it (after growing by 3 units on every side).
    * ``one_cell_table``: a "Table" cluster with fewer than two cells is
      removed. Like the other rules this is only evaluated while comparing
      two clusters with different ids.

    Args:
        cluster_predictions: List of cluster dicts with "id", "type", "bbox"
            and "cell_ids". The list is modified in place.
        raw_cells: Not used by this function.
        merge_cells: Enable the containment merge rule.
        img_table: Enable the text-inside-picture/table rule.
        one_cell_table: Enable the one-cell-table rule.

    Returns:
        The same list object without the removed clusters.
    """
    tolerance = 3
    ids_to_delete = set()

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] == cluster_2["id"]:
                continue

            box_1 = cluster_1["bbox"]
            box_2 = cluster_2["bbox"]

            if merge_cells:
                # Remove any artifacts created by merging clusters.
                shrunk_box_2 = [
                    box_2[0] + tolerance,
                    box_2[1] + tolerance,
                    box_2[2] - tolerance,
                    box_2[3] - tolerance,
                ]
                if contains(box_1, shrunk_box_2):
                    cluster_1["cell_ids"] = (
                        cluster_1["cell_ids"] + cluster_2["cell_ids"]
                    )
                    ids_to_delete.add(cluster_2["id"])
            elif img_table:
                # Remove text clusters that appear inside tables or images
                # (such as pdf cells in graphs). The membership test binds
                # "Picture or Table" together; the former
                # `A and B or C` expression matched any cluster_1 whenever
                # cluster_2 was a Table.
                if cluster_1["type"] == "Text" and cluster_2["type"] in (
                    "Picture",
                    "Table",
                ):
                    grown_box_2 = [
                        box_2[0] - tolerance,
                        box_2[1] - tolerance,
                        box_2[2] + tolerance,
                        box_2[3] + tolerance,
                    ]
                    if bb_iou(box_1, box_2) > 0.5 or contains(grown_box_2, box_1):
                        ids_to_delete.add(cluster_1["id"])

            if one_cell_table:
                # Remove tables that have fewer than two pdf cells.
                if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
                    ids_to_delete.add(cluster_1["id"])

    if ids_to_delete:
        # Filter in place rather than calling remove() while iterating over
        # the same list, which skips the element after each removal.
        cluster_predictions[:] = [
            cluster
            for cluster in cluster_predictions
            if cluster["id"] not in ids_to_delete
        ]
    return cluster_predictions
|
|
|
|
|
|
def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
    """Fill in "cell_ids" for every cluster from the raw cells it encloses.

    For each cluster, compute_enclosed_cells is called on its "bbox" with
    "threshold" as the minimum cell intersection, and the resulting indices
    into raw_cells replace the cluster's "cell_ids".
    These cell ids are ids of the raw cells. They are often, but not always,
    the same as the "id" or the index of the "cells" list in a prediction.
    Returns the same clusters list, modified in place.
    """
    for cluster in clusters:
        enclosed = compute_enclosed_cells(
            cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
        )
        # Only the cell indices are kept; the overlap areas are discarded.
        cluster["cell_ids"] = enclosed[0]
    return clusters
|
|
|
|
|
|
# Maps every cell index to the clusters that contain it, and lists the orphan (no cluster) and ambiguous (more than one cluster) cells
|
|
def cell_id_state_map(clusters, cell_count):
    """Classify cells by the number of clusters they belong to.

    Returns a tuple of three lists:
      * per cell index, the indices of the clusters containing that cell,
      * the cell indices that belong to no cluster (orphans),
      * the cell indices that belong to more than one cluster (ambiguous).
    """
    clusters_around_cells = find_clusters_around_cells(cell_count, clusters)

    orphan_cell_indices = []
    ambiguous_cell_indices = []
    for cell_ix in range(cell_count):
        owner_count = len(clusters_around_cells[cell_ix])
        if owner_count == 0:
            orphan_cell_indices.append(cell_ix)
        elif owner_count > 1:
            ambiguous_cell_indices.append(cell_ix)

    return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices
|