# pylint: disable=invalid-name,too-many-branches,too-many-statements,too-many-arguments
import os
import io
import typing
import hashlib
import urllib.request
import urllib.parse
import cv2
import imgaug
import numpy as np
import validators
import typing_extensions as tx
import matplotlib.pyplot as plt
from shapely import geometry
from scipy import spatial
def read(filepath_or_buffer: typing.Union[str, io.BytesIO, np.ndarray]):
    """Read a file into an image object

    Args:
        filepath_or_buffer: The path to the file, a URL, or any object
            with a `read` method (such as `io.BytesIO`). A numpy array
            is returned unchanged.

    Returns:
        The input itself when it is already a numpy array; otherwise the
        loaded image after a `cv2.COLOR_BGR2RGB` conversion.

    Raises:
        TypeError: If the input is not an array, a string or a readable
            object.
        AssertionError: If a non-URL string does not name an existing file.
    """
    if isinstance(filepath_or_buffer, np.ndarray):
        return filepath_or_buffer
    if hasattr(filepath_or_buffer, "read"):
        image = np.asarray(bytearray(filepath_or_buffer.read()), dtype=np.uint8)  # type: ignore
        image = cv2.imdecode(image, cv2.IMREAD_UNCHANGED)
    elif isinstance(filepath_or_buffer, str):
        if validators.url(filepath_or_buffer):
            # Close the HTTP response once its bytes have been decoded
            # instead of leaving the connection to the garbage collector.
            with urllib.request.urlopen(filepath_or_buffer) as response:
                return read(response)
        assert os.path.isfile(filepath_or_buffer), (
            "Could not find image at path: " + filepath_or_buffer
        )
        image = cv2.imread(filepath_or_buffer)
    else:
        # Without this branch `image` would be unbound below and the caller
        # would get an unrelated-looking error.
        raise TypeError(
            "Expected a numpy array, a path/URL string or an object with a "
            f"`read` method, got {type(filepath_or_buffer).__name__}."
        )
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
def get_rotated_width_height(box):
    """
    Returns the width and height of a rotated rectangle

    The width is the mean length of the two edges joining points 0-1 and
    2-3; the height is the mean length of the edges joining points 0-3 and
    1-2. Both values are truncated to integers.

    Args:
        box: A list (or array) of four points starting in the top left
            corner and moving clockwise.

    Returns:
        A `(width, height)` tuple of ints.
    """
    # Accept plain nested lists as well as arrays: indexing a list with
    # `np.newaxis` would fail.
    corners = np.asarray(box)

    def edge_length(start, end):
        return spatial.distance.cdist(
            corners[start][np.newaxis], corners[end][np.newaxis], "euclidean"
        )[0][0]

    w = (edge_length(0, 1) + edge_length(2, 3)) / 2
    h = (edge_length(0, 3) + edge_length(1, 2)) / 2
    return int(w), int(h)
# pylint:disable=too-many-locals
def warpBox(
    image,
    box,
    target_height=None,
    target_width=None,
    margin=0,
    cval=None,
    return_transform=False,
    skip_rotate=False,
):
    """Warp a boxed region in an image given by a set of four points into
    a rectangle with a specified width and height. Useful for taking crops
    of distorted or rotated text.

    Args:
        image: The image from which to take the box
        box: A list of four points starting in the top left
            corner and moving clockwise.
        target_height: The height of the output rectangle
        target_width: The width of the output rectangle
        margin: Inset, in output pixels, of the warped box corners from
            the edges of the scaled rectangle.
        cval: Fill value for the output canvas. Defaults to zeros.
        return_transform: Whether to return the transformation
            matrix with the image.
        skip_rotate: If true, `box` is used as given instead of being
            passed through `get_rotated_box` first.

    Returns:
        The warped crop on a `uint8` canvas of the target size, plus the
        perspective matrix when `return_transform` is set.
    """
    is_color = len(image.shape) == 3
    if cval is None:
        cval = (0, 0, 0) if is_color else 0
    if not skip_rotate:
        box, _ = get_rotated_box(box)
    w, h = get_rotated_width_height(box)

    width_given = target_width is not None
    height_given = target_height is not None
    assert (
        width_given == height_given
    ), "Either both or neither of target width and height must be provided."
    if not width_given and not height_given:
        target_width, target_height = w, h

    # Uniform scale that keeps the box inside the target rectangle.
    scale = min(target_width / w, target_height / h)
    scaled_w = scale * w
    scaled_h = scale * h
    destination = np.array(
        [
            [margin, margin],
            [scaled_w - margin, margin],
            [scaled_w - margin, scaled_h - margin],
            [margin, scaled_h - margin],
        ]
    ).astype("float32")
    M = cv2.getPerspectiveTransform(src=box, dst=destination)
    crop = cv2.warpPerspective(image, M, dsize=(int(scaled_w), int(scaled_h)))

    if is_color:
        target_shape = (target_height, target_width, 3)
    else:
        target_shape = (target_height, target_width)
    full = (np.zeros(target_shape) + cval).astype("uint8")
    # The crop occupies the top-left corner; the rest keeps the fill value.
    crop_height, crop_width = crop.shape[0], crop.shape[1]
    full[:crop_height, :crop_width] = crop
    return (full, M) if return_transform else full
def flatten(list_of_lists):
    """Concatenate the items of every sub-iterable into one flat list."""
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat
def combine_line(line):
    """Combine a set of boxes in a line into a single bounding
    box.

    Args:
        line: A list of (box, character) entries

    Returns:
        A (box, text) tuple
    """
    text = "".join("" if character is None else character for _, character in line)

    # Walk along the first two points of every box, then come back along
    # points 3 and 2 of every box in reverse order to form one outline.
    forward_edges = [coords[:2] for coords, _ in line]
    return_edges = [
        np.array([coords[3], coords[2]]) for coords, _ in reversed(line)
    ]
    outline = np.concatenate(forward_edges + return_edges).astype("float32")
    anchor = outline[0]

    corners = cv2.boxPoints(cv2.minAreaRect(outline))
    # Rotate the corner list so it starts at the corner nearest the
    # outline's first point.
    nearest = np.linalg.norm(corners - anchor, axis=1).argmin()
    box = np.array(np.roll(corners, -nearest, 0))
    return box, text
def drawAnnotations(image, predictions, ax=None):
    """Draw text annotations onto image.

    Args:
        image: The image on which to draw
        predictions: The predictions as provided by `pipeline.recognize`.
        ax: A matplotlib axis on which to draw.

    Returns:
        The axis that was drawn on (a new one if `ax` was not given).
    """
    if ax is None:
        _, ax = plt.subplots()
    ax.imshow(drawBoxes(image=image, boxes=predictions, boxes_format="predictions"))

    # Order by each box's smallest y value, then split by which half of the
    # image the box's smallest x value falls in.
    ordered = sorted(predictions, key=lambda p: p[1][:, 1].min())
    half_width = image.shape[1] / 2
    groups = {"left": [], "right": []}
    for word, box in ordered:
        side = "left" if box[:, 0].min() < half_width else "right"
        groups[side].append((word, box))

    ax.set_yticks([])
    ax.set_xticks([])

    image_size = np.array([image.shape[1], image.shape[0]])
    for side in ("left", "right"):
        group = groups[side]
        on_left = side == "left"
        for index, (text, box) in enumerate(group):
            # Arrow target in axes-fraction coordinates (y axis flipped).
            xy = box[0] / image_size
            xy[1] = 1 - xy[1]
            label_y = 1 - (index / len(group))
            ax.annotate(
                text=text,
                xy=xy,
                xytext=(-0.05 if on_left else 1.05, label_y),
                xycoords="axes fraction",
                arrowprops={"arrowstyle": "->", "color": "r"},
                color="r",
                fontsize=14,
                horizontalalignment="right" if on_left else "left",
            )
    return ax
def drawBoxes(image, boxes, color=(255, 0, 0), thickness=5, boxes_format="boxes"):
    """Draw boxes onto an image.

    Args:
        image: The image on which to draw the boxes.
        boxes: The boxes to draw.
        color: The color for each box.
        thickness: The thickness for each box.
        boxes_format: The format used for providing the boxes. Options are
            "boxes" which indicates an array with shape(N, 4, 2) where N is the
            number of boxes and each box is a list of four points) as provided
            by `keras_ocr.detection.Detector.detect`, "lines" (a list of
            lines where each line itself is a list of (box, character) tuples) as
            provided by `keras_ocr.data_generation.get_image_generator`,
            or "predictions" where boxes is by itself a list of (word, box) tuples
            as provided by `keras_ocr.pipeline.Pipeline.recognize` or
            `keras_ocr.recognition.Recognizer.recognize_from_boxes`.

    Returns:
        A copy of the image with the boxes drawn, or the input image itself
        when `boxes` is empty.
    """
    if len(boxes) == 0:
        return image
    canvas = image.copy()
    # Reduce the richer formats to a plain sequence of boxes.
    if boxes_format == "lines":
        boxes = [box for line in boxes for box, _ in line]
    if boxes_format == "predictions":
        boxes = [box for _, box in boxes]
    for box in boxes:
        cv2.polylines(
            img=canvas,
            pts=box[np.newaxis].astype("int32"),
            color=color,
            thickness=thickness,
            isClosed=True,
        )
    return canvas
# The "boxes" overload comes first so that a call relying on the default
# `boxes_format` resolves to the ndarray signature.
@typing.overload
def adjust_boxes(
    boxes, scale=1, boxes_format: tx.Literal["boxes"] = "boxes"
) -> np.ndarray:
    ...


@typing.overload
def adjust_boxes(
    boxes, scale=1, boxes_format: tx.Literal["lines"] = "lines"
) -> typing.List[typing.List[typing.Tuple[np.ndarray, str]]]:
    ...


@typing.overload
def adjust_boxes(
    boxes, scale=1, boxes_format: tx.Literal["predictions"] = "predictions"
) -> typing.List[typing.Tuple[str, np.ndarray]]:
    ...


def adjust_boxes(
    boxes,
    scale=1,
    boxes_format: tx.Literal["boxes", "predictions", "lines"] = "boxes",
) -> typing.Union[
    np.ndarray,
    typing.List[typing.List[typing.Tuple[np.ndarray, str]]],
    typing.List[typing.Tuple[str, np.ndarray]],
]:
    """Adjust boxes using a given scale.

    Args:
        boxes: The boxes to adjust
        boxes_format: The format for the boxes. See the `drawBoxes` function
            for an explanation on the options.
        scale: The scale to apply

    Returns:
        The boxes in the same format with every coordinate multiplied by
        `scale`. When `scale == 1` the input object is returned untouched
        (no copy, no array conversion, no format check).

    Raises:
        NotImplementedError: If `scale != 1` and `boxes_format` is unknown.
    """
    if scale == 1:
        return boxes
    if boxes_format == "boxes":
        return np.array(boxes) * scale
    if boxes_format == "lines":
        return [
            [(np.array(box) * scale, character) for box, character in line]
            for line in boxes
        ]
    if boxes_format == "predictions":
        return [(word, np.array(box) * scale) for word, box in boxes]
    raise NotImplementedError(f"Unsupported boxes format: {boxes_format}")
def augment(
    boxes,
    augmenter: imgaug.augmenters.meta.Augmenter,
    image=None,
    boxes_format="boxes",
    image_shape=None,
    area_threshold=0.5,
    min_area=None,
):
    """Augment an image and associated boxes together.

    Args:
        image: The image which we wish to apply the augmentation.
        boxes: The boxes that will be augmented together with the image
        boxes_format: The format for the boxes. See the `drawBoxes` function
            for an explanation on the options.
        image_shape: The shape of the input image if no image will be provided.
        area_threshold: Fraction of bounding box that we require to be
            in augmented image to include it.
        min_area: The minimum area for a character to be included.

    Returns:
        A `(image_augmented, boxes_augmented)` tuple. `image_augmented` is
        `None` when no image was provided; boxes that fail the area checks
        are dropped and the kept ones are clipped to the augmented image.

    Raises:
        ValueError: If neither `image` nor `image_shape` is provided.
        NotImplementedError: If `boxes_format` is not recognised.
    """
    if image is None and image_shape is None:
        raise ValueError('One of "image" or "image_shape" must be provided.')
    # Freeze the augmenter so the image and every box see the same transform.
    augmenter = augmenter.to_deterministic()

    if image is None:
        image_augmented = None
        # No image to measure: push the bottom-right corner through the
        # augmenter to learn the augmented extent.
        corner = imgaug.KeypointsOnImage.from_xy_array(
            xy=[[image_shape[1], image_shape[0]]], shape=image_shape
        )
        width_augmented, height_augmented = augmenter.augment_keypoints(
            corner
        ).to_xy_array()[0]
        image_augmented_shape = (height_augmented, width_augmented)
    else:
        image_augmented = augmenter(image=image)
        image_shape = image.shape[:2]
        image_augmented_shape = image_augmented.shape[:2]

    def contour_area(points):
        return cv2.contourArea(np.int32(points)[:, np.newaxis, :])

    def box_inside_image(box):
        """Return (keep, box clipped to the augmented image bounds)."""
        area_before = contour_area(box)
        if area_before == 0:
            return False, box
        clipped = box.copy()
        clipped[:, 0] = clipped[:, 0].clip(0, image_augmented_shape[1])
        clipped[:, 1] = clipped[:, 1].clip(0, image_augmented_shape[0])
        area_after = contour_area(clipped)
        keep = ((area_after / area_before) >= area_threshold) and (
            min_area is None or area_after > min_area
        )
        return keep, clipped

    def augment_box(box):
        keypoints = imgaug.KeypointsOnImage.from_xy_array(box, shape=image_shape)
        return augmenter.augment_keypoints(keypoints).to_xy_array()

    if boxes_format == "boxes":
        boxes_augmented = []
        for box in boxes:
            inside, clipped = box_inside_image(augment_box(box))
            if inside:
                boxes_augmented.append(clipped)
    elif boxes_format == "lines":
        boxes_augmented = []
        for line in boxes:
            kept = []
            for box, character in line:
                inside, clipped = box_inside_image(augment_box(box))
                if inside:
                    kept.append((clipped, character))
            # Sometimes all the characters in a line are removed.
            if kept:
                boxes_augmented.append(kept)
    elif boxes_format == "predictions":
        boxes_augmented = []
        for word, box in boxes:
            inside, clipped = box_inside_image(augment_box(box))
            if inside:
                boxes_augmented.append((word, clipped))
    else:
        raise NotImplementedError(f"Unsupported boxes format: {boxes_format}")
    return image_augmented, boxes_augmented
def pad(image, width: int, height: int, cval: int = 255):
    """Pad an image to a desired size. Raises an exception if image
    is larger than desired size.

    The image is placed in the top-left corner of the output; the remaining
    area is filled with `cval`.

    Args:
        image: The input image
        width: The output width
        height: The output height
        cval: The value to use for filling the image.

    Returns:
        A new array of shape `(height, width)` or `(height, width, channels)`.

    Raises:
        ValueError: If the image is taller or wider than the requested size.
    """
    # Compare against the *input* dimensions. The previous checks compared
    # the requested size with itself and could never fail.
    if image.shape[0] > height:
        raise ValueError(
            f"Image height ({image.shape[0]}) exceeds the requested output "
            f"height ({height})."
        )
    if image.shape[1] > width:
        raise ValueError(
            f"Image width ({image.shape[1]}) exceeds the requested output "
            f"width ({width})."
        )
    output_shape: typing.Union[typing.Tuple[int, int, int], typing.Tuple[int, int]]
    if len(image.shape) == 3:
        output_shape = (height, width, image.shape[-1])
    else:
        output_shape = (height, width)
    padded = np.zeros(output_shape, dtype=image.dtype) + cval
    padded[: image.shape[0], : image.shape[1]] = image
    return padded
def resize_image(image, max_scale, max_size):
    """Obtain the optimal resized image subject to a maximum scale
    and maximum size.

    Args:
        image: The input image
        max_scale: The maximum scale to apply
        max_size: The maximum size to return

    Returns:
        A `(resized_image, scale)` tuple where `scale` is the factor that
        was actually applied.
    """
    # Only the spatial dimensions count: including the channel axis would
    # let the channel count win for very small images.
    longest_side = max(image.shape[:2])
    if longest_side * max_scale > max_size:
        # We are constrained by the maximum size
        scale = max_size / longest_side
    else:
        # We are constrained by scale
        scale = max_scale
    return (
        cv2.resize(
            image, dsize=(int(image.shape[1] * scale), int(image.shape[0] * scale))
        ),
        scale,
    )
# pylint: disable=too-many-arguments
def fit(
    image,
    width: int,
    height: int,
    cval: int = 255,
    mode="letterbox",
    return_scale=False,
):
    """Obtain a new image, fit to the specified size.

    Args:
        image: The input image
        width: The new width
        height: The new height
        cval: The constant value to use to fill the remaining areas of
            the image
        mode: "letterbox" resizes the image to fit inside the target and
            fills the rest with `cval`; "crop" resizes it to cover the
            target and cuts off the overflow.
        return_scale: Whether to return the scale used for the image

    Returns:
        The new image, or `(image, scale)` when `return_scale` is set.

    Raises:
        NotImplementedError: If a resize is needed and `mode` is unknown.
    """
    x_scale = width / image.shape[1]
    y_scale = height / image.shape[0]

    if x_scale == 1 and y_scale == 1:
        # Already the requested size: hand back the input untouched.
        fitted, scale = image, 1
    else:
        width_bound = (x_scale <= y_scale and mode == "letterbox") or (
            x_scale >= y_scale and mode == "crop"
        )
        if width_bound:
            scale = width / image.shape[1]
            resize_width = width
            resize_height = (width / image.shape[1]) * image.shape[0]
        else:
            scale = height / image.shape[0]
            resize_height = height
            resize_width = scale * image.shape[1]
        resize_width = int(resize_width)
        resize_height = int(resize_height)

        if mode == "letterbox":
            fitted = np.zeros((height, width, 3), dtype="uint8") + cval
            resized = cv2.resize(image, dsize=(resize_width, resize_height))
            fitted[: resized.shape[0], : resized.shape[1]] = resized[:height, :width]
        elif mode == "crop":
            resized = cv2.resize(image, dsize=(resize_width, resize_height))
            fitted = resized[:height, :width]
        else:
            raise NotImplementedError(f"Unsupported mode: {mode}")

    if return_scale:
        return fitted, scale
    return fitted
def read_and_fit(
    filepath_or_array: typing.Union[str, np.ndarray],
    width: int,
    height: int,
    cval: int = 255,
    mode="letterbox",
):
    """Read an image from disk and fit to the specified size.

    Args:
        filepath_or_array: The path to the image or numpy array of shape HxWx3
        width: The new width
        height: The new height
        cval: The constant value to use to fill the remaining areas of
            the image
        mode: The mode to pass to "fit" (crop or letterbox)

    Returns:
        The new image
    """
    # Only strings need loading; anything else is passed to `fit` as is.
    if isinstance(filepath_or_array, str):
        image = read(filepath_or_array)
    else:
        image = filepath_or_array
    return fit(image=image, width=width, height=height, cval=cval, mode=mode)
def sha256sum(filename):
    """Compute the sha256 hash for a file.

    The file is read unbuffered in 128 KiB chunks into one reusable buffer.

    Args:
        filename: Path of the file to hash.

    Returns:
        The hexadecimal digest string.
    """
    digest = hashlib.sha256()
    chunk = memoryview(bytearray(128 * 1024))
    with open(filename, "rb", buffering=0) as stream:
        while True:
            count = stream.readinto(chunk)  # type: ignore
            if count == 0:
                break
            digest.update(chunk[:count])
    return digest.hexdigest()
def get_default_cache_dir():
    """Return the cache directory: the `KERAS_OCR_CACHE_DIR` environment
    variable if set, otherwise `~/.keras-ocr` with `~` expanded."""
    fallback = os.path.expanduser(os.path.join("~", ".keras-ocr"))
    return os.environ.get("KERAS_OCR_CACHE_DIR", fallback)
def download_and_verify(url, sha256=None, cache_dir=None, verbose=True, filename=None):
    """Download a file to a cache directory and verify it with a sha256
    hash.

    Args:
        url: The file to download
        sha256: The sha256 hash to check. If the file already exists and the hash
            matches, we don't download it again.
        cache_dir: The directory in which to cache the file. The default is
            `~/.keras-ocr`.
        verbose: Whether to log progress
        filename: The filename to use for the file. By default, the filename is
            derived from the URL.

    Returns:
        The path of the cached file.

    Raises:
        AssertionError: If a hash was given and the file does not match it.
    """
    if cache_dir is None:
        cache_dir = get_default_cache_dir()
    if filename is None:
        filename = os.path.basename(urllib.parse.urlparse(url).path)
    filepath = os.path.join(cache_dir, filename)
    os.makedirs(os.path.split(filepath)[0], exist_ok=True)
    if verbose:
        print("Looking for " + filepath)

    # Remember a successful check of the cached copy so the file is not
    # hashed a second time below.
    verified = False
    needs_download = True
    if os.path.isfile(filepath):
        if sha256:
            verified = sha256sum(filepath) == sha256
            needs_download = not verified
        else:
            needs_download = False

    if needs_download:
        if verbose:
            print("Downloading " + filepath)
        urllib.request.urlretrieve(url, filepath)

    # Raised explicitly rather than via `assert` so the integrity check
    # still runs under `python -O`; the exception type is unchanged.
    if sha256 is not None and not verified and sha256sum(filepath) != sha256:
        raise AssertionError("Error occurred verifying sha256.")
    return filepath
def get_rotated_box(
    points,
) -> typing.Tuple[np.ndarray, float,]:
    """Obtain the parameters of a rotated box.

    Args:
        points: The points to enclose. They are used directly when no
            minimum rotated rectangle can be derived from them.

    Returns:
        The vertices of the rotated box in top-left,
        top-right, bottom-right, bottom-left order along
        with the angle of rotation about the bottom left corner.
    """
    try:
        cloud = geometry.MultiPoint(points=points)
        ring_x, ring_y = cloud.minimum_rotated_rectangle.exterior.xy
        # The exterior ring repeats its first vertex at the end; drop it.
        pts = np.array(list(zip(ring_x, ring_y)))[:-1]
    except AttributeError:
        # There weren't enough points for the minimum rotated rectangle function
        pts = points

    # Corner ordering adapted from
    # https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
    # Split the points into the two left-most and the two right-most by x.
    by_x = pts[np.argsort(pts[:, 0]), :]
    left_pair = by_x[:2, :]
    right_pair = by_x[2:, :]

    # Within the left pair, the smaller y is the top-left corner.
    left_pair = left_pair[np.argsort(left_pair[:, 1]), :]
    tl, bl = left_pair

    # Of the right pair, the point farther from the top-left corner is
    # taken as bottom-right and the nearer one as top-right.
    distances = spatial.distance.cdist(tl[np.newaxis], right_pair, "euclidean")[0]
    farthest_first = np.argsort(distances)[::-1]
    br, tr = right_pair[farthest_first, :]

    ordered = np.array([tl, tr, br, bl], dtype="float32")
    rotation = np.arctan((tl[0] - bl[0]) / (tl[1] - bl[1]))
    return ordered, rotation
def fix_line(line):
    """Given a list of (box, character) tuples, return a revised
    line with a consistent ordering of left-to-right or top-to-bottom,
    with each box provided with (top-left, top-right, bottom-right, bottom-left)
    ordering.

    Returns:
        A tuple that is the fixed line as well as a string indicating
        whether the line is horizontal or vertical.
    """
    oriented = [(get_rotated_box(box)[0], character) for box, character in line]
    centers = np.array([box.mean(axis=0) for box, _ in oriented])
    order_by_x = centers[:, 0].argsort()
    order_by_y = centers[:, 1].argsort()

    # Total spread of the box centers along each axis decides the direction.
    vertical_span = np.diff(centers[order_by_y][:, 1]).sum()
    horizontal_span = np.diff(centers[order_by_x][:, 0]).sum()
    if vertical_span > horizontal_span:
        order, orientation = order_by_y, "vertical"
    else:
        order, orientation = order_by_x, "horizontal"
    return [oriented[idx] for idx in order], orientation