1 ##############################################################################
2 # $URL: http://perlcritic.tigris.org/svn/perlcritic/trunk/Perl-Critic/lib/Perl/Critic/Policy/RegularExpressions/ProhibitComplexRegexes.pm $
3 # $Date: 2008-07-03 10:19:10 -0500 (Thu, 03 Jul 2008) $
6 ##############################################################################
8 package Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes;
15 use English qw(-no_match_vars);
18 use Perl::Critic::Utils qw{ :booleans :severities };
19 use Perl::Critic::Utils::PPIRegexp qw{ parse_regexp get_match_string get_modifiers };
20 use base 'Perl::Critic::Policy';
22 our $VERSION = '1.088';
24 #-----------------------------------------------------------------------------
26 Readonly::Scalar my $DESC => q{Split long regexps into smaller qr// chunks};
27 Readonly::Scalar my $EXPL => [261];
29 #-----------------------------------------------------------------------------
31 sub supported_parameters {
34 name => 'max_characters',
36 'The maximum number of characters to allow in a regular expression.',
37 default_string => '60',
38 behavior => 'integer',
44 sub default_severity { return $SEVERITY_MEDIUM }
45 sub default_themes { return qw( core pbp maintenance ) }
46 sub applies_to { return qw(PPI::Token::Regexp::Match
47 PPI::Token::Regexp::Substitute
48 PPI::Token::QuoteLike::Regexp) }
50 #-----------------------------------------------------------------------------
53 my ( $self, $elem, undef ) = @_;
55 # Optimization: if its short enough now, parsing won't make it longer
56 return if $self->{_max_characters} >= length get_match_string($elem);
58 # If it has an "x" flag, it might be shorter after comment and whitespace removal
59 my %modifiers = get_modifiers($elem);
61 my $re = parse_regexp($elem);
62 return if !$re; # syntax error, abort
65 # HACK: Remove any (?xism:...) wrapper we may have added in the parse process...
66 $qr =~ s/\A [(][?][xism]+(?:-[xism]+)?: (.*) [)] \z/$1/xms;
68 # Hack: don't count long \p{...} expressions against us so badly
69 $qr =~ s/\\[pP][{]\w+[}]/\\p{...}/gmx;
71 return if $self->{_max_characters} >= length $qr;
74 return $self->violation( $DESC, $EXPL, $elem );
81 #-----------------------------------------------------------------------------
85 =for stopwords BNF Tatsuhiko Miyagawa
89 Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes - Split long regexps into smaller C<qr//> chunks.
93 This Policy is part of the core L<Perl::Critic> distribution.
98 Big regexps are hard to read, perhaps even the hardest part of Perl.
99 A good practice to write digestible chunks of regexp and put them
100 together. This policy flags any regexp that is longer than C<N>
101 characters, where C<N> is a configurable value that defaults to 60.
102 If the regexp uses the C<x> flag, then the length is computed after
103 parsing out any comments or whitespace.
107 As an example, look at the regexp used to match email
108 addresses in L<Email::Valid::Loose> (tweaked lightly to wrap for POD)
110 (?x-ism:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]
111 \000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015
112 "]*)*")(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[
113 \]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n
114 \015"]*)*")|\.)*\@(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,
115 ;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]
116 )(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000
117 -\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)
119 which is constructed from the following code:
126 my $nonASCII = '\x80-\xff';
127 my $ctrl = '\000-\037';
128 my $cr_list = '\n\015';
129 my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; # "
130 my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/;
131 my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>;
132 my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;# "
133 my $atom = qq<$atom_char+(?!$atom_char)>;
134 my $quoted_str = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; # "
135 my $word = qq<(?:$atom|$quoted_str)>;
136 my $domain_ref = $atom;
137 my $domain_lit = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>;
138 my $sub_domain = qq<(?:$domain_ref|$domain_lit)>;
139 my $domain = qq<$sub_domain(?:$period$sub_domain)*>;
140 my $local_part = qq<$word(?:$word|$period)*>; # This part is modified
141 $Addr_spec_re = qr<$local_part\@$domain>;
143 If you read the code from bottom to top, it is quite readable. And,
144 you can even see the one violation of RFC822 that Tatsuhiko Miyagawa
145 deliberately put into Email::Valid::Loose to allow periods. Look for
146 the C<|\.> in the upper regexp to see that same deviation.
148 One could certainly argue that the top regexp could be re-written more
149 legibly with C<m//x> and comments. But the bottom version is
150 self-documenting and, for example, doesn't repeat C<\x80-\xff> 18
151 times. Furthermore, it's much easier to compare the second version
152 against the source BNF grammar in RFC 822 to judge whether the
153 implementation is sound even before running tests.
157 This policy allows regexps up to C<N> characters long, where C<N>
158 defaults to 60. You can override this to set it to a different number
159 with the C<max_characters> setting. To do this, put entries in a
160 F<.perlcriticrc> file like this:
162 [RegularExpressions::ProhibitComplexRegexes]
167 Initial development of this policy was supported by a grant from the Perl Foundation.
171 Chris Dolan <cdolan@cpan.org>
175 Copyright (c) 2007-2008 Chris Dolan. Many rights reserved.
177 This program is free software; you can redistribute it and/or modify
178 it under the same terms as Perl itself. The full text of this license
179 can be found in the LICENSE file included with this module
185 # cperl-indent-level: 4
187 # indent-tabs-mode: nil
188 # c-indentation-style: bsd
190 # ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :