I would appreciate any advice related to truncation methods.
Some emojis consist of several characters (code points) and many bytes, which makes them long when measured by character or byte count, even though their display width is only two columns. Because of this, I am looking for a truncation method that truncates strings based on their display width.
Is there already a good method for this purpose, or should I develop a new one?
Inspired by String#truncate()
and String#truncate_bytes()
in Active Support Core Extensions, I am developing the following methods. However, if there is already a better method, I will adopt it.
I would appreciate your comments.
# coding: utf-8
require 'minitest/unit'
require 'unicode/display_width'
module TruncateBy
  # Returns the number of characters (as counted by String#size) of +string+
  # to keep when truncating it to <tt>truncate_to</tt>, or +nil+ if +string+
  # is not longer than <tt>truncate_to</tt>:
  #
  #   TruncateBy.truncation_size('12345678', 7) {|grapheme| grapheme.size}
  #   # => 7
  #   TruncateBy.truncation_size('12345678', 8) {|grapheme| grapheme.size}
  #   # => nil
  #
  # The block is called once per grapheme cluster and returns the size of that
  # cluster. The unit of the return value is always the number of characters,
  # while the unit of <tt>truncate_to</tt> must be the same as the unit of
  # what the block returns:
  #
  #   '👍🏽👍🏽👍🏽'.bytesize
  #   # => 24
  #   TruncateBy.truncation_size('👍🏽👍🏽👍🏽', 16) {|grapheme| grapheme.bytesize}
  #   # => 4 # the bytesize of the first 4 characters is 16.
  #   '👍🏽👍🏽👍🏽'[0, 4]
  #   # => "👍🏽👍🏽"
  #   '👍🏽👍🏽'.bytesize
  #   # => 16 # equal to truncate_to
  #
  # Pass <tt>:omission_size</tt> to make room for an omission string:
  #
  #   TruncateBy.truncation_size('12345678', 7, omission_size: 3) {|grapheme| grapheme.size}
  #   # => 4
  #
  # The unit of <tt>:omission_size</tt> must be the same as the unit of what
  # the block returns.
  #
  # Grapheme clusters are never split, so the size of the string after
  # truncation can be less than <tt>truncate_to</tt>:
  #
  #   TruncateBy.truncation_size('👍🏽👍🏽👍🏽', 20) {|grapheme| grapheme.bytesize}
  #   # => 4
  #   '👍🏽👍🏽👍🏽'[0, 4]
  #   # => "👍🏽👍🏽" # string after truncation
  #   '👍🏽👍🏽'.bytesize
  #   # => 16 # less than truncate_to, which is 20
  def truncation_size(string, truncate_to, omission_size: 0)
    # Budget left for the kept prefix once the omission is accounted for.
    room = truncate_to - omission_size
    size = 0      # accumulated size of the clusters seen so far, in block units
    size_chr = 0  # number of characters of the prefix that fits into +room+
    string.each_grapheme_cluster do |grapheme|
      size += yield(grapheme)
      if size <= room
        size_chr += grapheme.size
      elsif size > truncate_to
        # Only now is it certain that +string+ is longer than +truncate_to+.
        return size_chr
      end
      # Otherwise the cluster falls into the space reserved for the omission:
      # keep scanning to find out whether the whole string fits after all.
    end
    nil
  end

  # Truncates a given +string+ to length <tt>truncate_to</tt> if +string+ is
  # longer than <tt>truncate_to</tt>. Lengths are measured by the given block,
  # which receives one grapheme cluster of +string+ at a time (and, once, the
  # whole <tt>:omission</tt> string) and returns its size:
  #
  #   TruncateBy.truncate_by('Once upon a time in a world far far away', 27) {|grapheme| grapheme.size}
  #   # => "Once upon a time in a worl…"
  #
  # A new string is returned in every case; +string+ itself is not modified.
  #
  # Grapheme clusters are never split, so the returned string can be shorter
  # than the maximum length designated by <tt>truncate_to</tt>.
  #
  #   '👍🏽👍🏽👍🏽'.size
  #   # => 6
  #   TruncateBy.truncate_by('👍🏽👍🏽👍🏽', 4) {|grapheme| grapheme.size}
  #   # => "👍🏽…"
  #   "👍🏽…".size
  #   # => 3 # shorter than 4
  #
  # The tail will be the <tt>:omission</tt> string (defaults to "…").
  # Passing +nil+ is the same as passing an empty string.
  #
  #   TruncateBy.truncate_by('Once upon a time in a world far far away', 27, omission: '->') {|grapheme| grapheme.size}
  #   # => "Once upon a time in a wor->"
  #
  #   TruncateBy.truncate_by('Once upon a time in a world far far away', 27, omission: nil) {|grapheme| grapheme.size}
  #   # => "Once upon a time in a world"
  #
  # Raises +ArgumentError+ when the size of <tt>:omission</tt>, as measured by
  # the block, exceeds <tt>truncate_to</tt>.
  #
  # Pass a string or regexp <tt>:separator</tt> to truncate +string+ at a natural break:
  #
  #   TruncateBy.truncate_by('Once upon a time in a world far far away', 27, separator: ' ') {|grapheme| grapheme.size}
  #   # => "Once upon a time in a…"
  #
  #   TruncateBy.truncate_by('Once upon a time in a world far far away', 27, separator: /\s/) {|grapheme| grapheme.size}
  #   # => "Once upon a time in a…"
  def truncate_by(string, truncate_to, omission: "…", separator: nil, &block)
    omission ||= ""
    omission_size = yield(omission)
    if omission_size > truncate_to
      # Report the size in the block's own unit; it is not necessarily bytes.
      raise ArgumentError,
            "Omission #{omission.inspect} measures #{omission_size}, " \
            "larger than the truncation length of #{truncate_to}"
    end

    size_chr = TruncateBy.truncation_size(string, truncate_to,
                                          omission_size: omission_size,
                                          &block)
    # nil means the string already fits: hand back an untouched copy.
    return string.dup unless size_chr

    # Prefer the last separator at or before the cut; fall back to the cut itself.
    stop = (separator && string.rindex(separator, size_chr)) || size_chr
    # Unary plus guarantees a mutable result even under frozen string literals.
    +"#{string[0, stop]}#{omission}"
  end

  module_function :truncation_size, :truncate_by
end
# Exercises TruncateBy.truncate_by with three different size measures:
# character count, byte count and terminal display width.
class TmpTest < Minitest::Test
  # Truncates +string+ measuring every grapheme cluster by its character count.
  def truncate_by_char(string, truncate_to, **kwargs)
    if string.size <= truncate_to
      string.dup
    else
      TruncateBy.truncate_by(string, truncate_to, **kwargs) do |grapheme|
        grapheme.size
      end
    end
  end

  def test_truncate_by_char
    assert_equal "Hello World!", truncate_by_char("Hello World!", 12)
    assert_equal "Hello World…", truncate_by_char("Hello World!!", 12)
    assert_equal "Hello W[...]", truncate_by_char("Hello World!!", 12, omission: "[...]")
    assert_equal "Hello…", truncate_by_char("Hello World!!", 12, separator: " ")
    assert_equal "Hello…", truncate_by_char("Hello World!!", 12, separator: /\s/)
    # Two consecutive spaces: a plain " " separator cuts at the last of them,
    # so one space survives in front of the omission ...
    assert_equal "Hello …", truncate_by_char("Hello  World!!", 12, separator: " ")
    # ... whereas the look-behind regexp only matches where the run of whitespace starts.
    assert_equal "Hello…", truncate_by_char("Hello  World!!", 12, separator: /(?<!\s)\s+/)
    # A separator found at index 0 leaves nothing but the omission.
    assert_equal "…", truncate_by_char(" HelloWorld!!", 12, separator: " ")
    assert_equal "…", truncate_by_char(" HelloWorld!!", 12, separator: /(?<!\s)\s+/)
    # Each emoji below is two code points: a base emoji plus a skin tone modifier.
    assert_equal 6, "👍🏽👍🏽👍🏽".size
    assert_equal "👍🏽👍🏽👍🏽", truncate_by_char("👍🏽👍🏽👍🏽", 6)
    assert_equal "👍🏽👍🏽…", truncate_by_char("👍🏽👍🏽👍🏽", 5)
  end

  # Truncates +string+ measuring every grapheme cluster by its byte size.
  def truncate_by_byte(string, truncate_to, **kwargs)
    if string.bytesize <= truncate_to
      string.dup
    else
      TruncateBy.truncate_by(string, truncate_to, **kwargs) do |grapheme|
        grapheme.bytesize
      end
    end
  end

  def test_truncate_by_byte
    assert_equal 24, "👍🏽👍🏽👍🏽".bytesize
    assert_equal "👍🏽👍🏽👍🏽", truncate_by_byte("👍🏽👍🏽👍🏽", 24)
    assert_equal "👍🏽👍🏽…", truncate_by_byte("👍🏽👍🏽👍🏽", 23)
  end

  # Truncates measuring every grapheme cluster by its display width.
  # The optional <tt>:display_width</tt> keyword is removed from +kwargs+ and
  # passed to Unicode::DisplayWidth.new; the remaining arguments are forwarded
  # to TruncateBy.truncate_by unchanged.
  def truncate_by_display_width(*args, **kwargs)
    display_width_opt =
      kwargs.delete(:display_width) || {ambiguous: 2,
                                        emoji: true}
    display_width =
      Unicode::DisplayWidth.new(**display_width_opt)
    TruncateBy.truncate_by(*args, **kwargs) do |grapheme|
      display_width.of grapheme
    end
  end

  def test_truncate_by_display_width
    assert_equal "👍🏽👍🏽👍🏽", truncate_by_display_width("👍🏽👍🏽👍🏽", 6, omission: nil)
    assert_equal "👍🏽👍🏽", truncate_by_display_width("👍🏽👍🏽👍🏽", 5, omission: nil)
  end
end