diff --git a/Third_Party_Open_Source_Software_Notice b/Third_Party_Open_Source_Software_Notice index f12554c08f6..f2ca5e25bfc 100644 --- a/Third_Party_Open_Source_Software_Notice +++ b/Third_Party_Open_Source_Software_Notice @@ -3057,6 +3057,587 @@ Software: tinyxml2 8.0.0 Copyright 2011, John Resig. Copyright 2011, The Dojo Foundation. +Software: icu 67.1 +Copyright (C) 2000-2004, International Business Machines Corporation +Copyright (C) 2002-2014, International Business Machines(C) Copyright IBM Corp. 1998-2011 - All Rights Reserved +Copyright (C) 2003-2008, International Business Machines +Copyright (C) 2005-2006, International Business Machines +Copyright (C) 2016 and later: Unicode, Inc. and others. +Copyright (c) 2001-2010 International Business Machines +Copyright (C) 2009, International Business Machines +Copyright (c) 2010-2015 International Business Machines Corporation and others. All rights reserved. +Copyright (C) 2002-2015, International Business Machines verbatim (minus copyright and #include) and copied together into this file. +Copyright (c) 1997-2014, International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 1997-2008, International Business Machines Corporation and +Copyright (c) 1997-2003, International Business Machines Corporation and +Copyright (c) 1996-2012, International Business Machines Corporation and +Copyright (c) 1997-2016, International Business Machines +Copyright (c) 1997-2013 International Business Machines +Copyright (c) 1997-2016, International Business Machines Corporation and +Copyright (c) 1997-2001, International Business Machines Corporation and +Copyright (c) 1997-2012, International Business Machines Corporation and +Copyright (c) 1997-2005, International Business Machines Corporation and +Copyright (c) 1997-2010, International Business Machines Corporation and +Copyright (c) 2011-2016, International Business Machines Corporation +Copyright (c) 1997-2009, International Business Machines Corporation and +Copyright (c) 1997-2002,2008, International Business Machines Corporation and +Copyright (c) 1997-2009,2014, International Business Machines +Copyright (C) 2000-2009, International Business Machines +Copyright (c) 1997-2015, International Business Machines Corporation and +Copyright (c) 1997-2013, International Business Machines Corporation and +Copyright (c) 2001-2016, International Business Machines Corporation and +Copyright (c) 1997-2016, International Business Machines Corporation +Copyright (c) 1997-2003, 2007-2009 International Business Machines Corporation and +Copyright (c) 2011-2014, International Business Machines Corporation +Copyright (c) 2003-2009, International Business Machines +Copyright (c) 2016, International Business Machines Corporation +Copyright (c) 1997-2004, International Business Machines Corporation and +Copyright (C) 2002-2016, International Business Machines +Copyright (C) 1998-2014, International Business Machines Corporation +Copyright (c) 2003-2013, International Business Machines Corporation and +Copyright (c) 2005-2016, International Business Machines Corporation and +Copyright (c) 1999-2013, International Business Machines Corporation and +Copyright (c) 2003-2015, International Business Machines Corporation and +Copyright (C) 2003-2016, International Business Machines +Copyright (C) 2003-2014, International Business Machines +Copyright (C) 2003, International Business Machines +Copyright (c) 1998-2016, International Business Machines Corporation and +Copyright (c) 2004-2015, 
International Business Machines Corporation and +Copyright (c) 2009-2016, International Business Machines Corporation and +Copyright (C) 2003-2012, International Business Machines +Copyright (c) 2000-2016, International Business Machines Corporation and +Copyright (C) 2001-2014, International Business Machines +Copyright (C) 2001-2016, International Business Machines +Copyright (c) 1997-2014, International Business Machines © 2017 and later: Unicode, Inc. and others. +Copyright (C) 2007-2016, International Business Machines © 2018 and later: Unicode, Inc. and others. +Copyright (c) 2015, International Business Machines Corporation +Copyright (c) 2014-2016, International Business Machines Corporation +Copyright (c) 2002-2016, International Business Machines +Copyright (c) 2001-2011,2015 International Business Machines +Copyright (c) 2001-2016 International Business Machines +Copyright (c) 2005-2013, International Business Machines Corporation and +Copyright (c) 1998-2014, International Business Machines Corporation and +Copyright (C) 1997-2016 International Business Machines +Copyright (C) 2009-2014, International Business Machines Corporation and +Copyright (c) 2002-2014, International Business Machines Corporation +Copyright (c) 2002-2007, International Business Machines Corporation +Copyright (C) 1996-2012, International Business Machines Corporation +Copyright (C) 1996-2008, International Business Machines Corporation +Copyright (C) 2007-2013, International Business Machines Corporation and +Copyright (C) 2008-2015, International Business Machines +Copyright (C) 2003-2013, International Business Machines Corporation and +Copyright (C) 2003-2013, International Business Machines Corporation +Copyright (C) 1997-2016, International Business Machines Corporation and +Copyright (C) 2001-2011, International Business Machines +Copyright (C) 2001-2008, International Business Machines +Copyright (C) 2003 - 2009, International Business Machines Corporation and +Copyright (C) 2003 - 2008, International Business Machines Corporation and +Copyright (C) 2007-2014, International Business Machines Corporation +Copyright (C) 2007-2013, International Business Machines Corporation +Copyright (C) 1997-2013, International Business Machines Corporation and +Copyright (C) 1996-2014, International Business Machines Corporation and +Copyright (C) 2010-2014, International Business Machines +Copyright (C) 2010-2015, International Business Machines +Copyright (C) 2013-2014, International Business Machines +Copyright (C) 1996-2015, International Business Machines +Copyright (C) 1996-2014, International Business Machines +Copyright (C) 2012-2015, International Business Machines +Copyright (C) 2012-2014, International Business Machines +Copyright (C) 2013-2015, International Business Machines +Copyright (C) 2013-2016, International Business Machines +Copyright (C) 1999-2016, International Business Machines +Copyright (C) 1999-2015, International Business Machines +Copyright (C) 1999-2014, International Business Machines +Copyright (C) 2015-2016, International Business Machines Corporation and others. 
+Copyright (C) 2003 - 2013, International Business Machines Corporation and +Copyright (C) 1999-2011, International Business Machines +Copyright (C) 2005-2016, International Business Machines +Copyright (C) 2005-2012, International Business Machines +Copyright (C) 2005-2015, International Business Machines +Copyright (C) 2005-2013, International Business Machines +Copyright (C) 2005-2014, International Business Machines +Copyright (c) 2004, International Business Machines +Copyright (c) 2004-2014 International Business Machines +Copyright (c) 2004-2014, International Business Machines +Copyright (C) 2013, International Business Machines Corporation +Copyright (C) 1997-2015, International Business Machines Corporation and +Copyright (C) 2016, International Business Machines +Copyright (c) IBM Corporation, 2000-2012. All rights reserved. +Copyright (c) IBM Corporation, 2000-2011. All rights reserved. +Copyright (c) IBM Corporation, 2000-2014. All rights reserved. +Copyright (c) IBM Corporation, 2000-2010. All rights reserved. +Copyright (c) IBM Corporation, 2000-2016. All rights reserved. +Copyright 2010 the V8 project authors. All rights reserved. +Copyright 2006-2008 the V8 project authors. All rights reserved. +Copyright 2012 the V8 project authors. All rights reserved. +Copyright (C) 2008-2016, International Business Machines Corporation and +Copyright (C) 2007-2016, International Business Machines Corporation and +Copyright (C) 2007-2012, International Business Machines Corporation and +Copyright (c) 2001-2011, International Business Machines +Copyright (c) 2001-2007, International Business Machines +Copyright (C) 2010-2014, International Business Machines Corporation and +Copyright (C) 1997-2010, International Business Machines Corporation and +Copyright (C) 1997-2012, International Business Machines Corporation and +Copyright (C) 2009-2015, International Business Machines Corporation and +Copyright (C) 2009-2012, International Business Machines Corporation and +Copyright (c) 2002-2012, International Business Machines Corporation +Copyright (c) 2002-2011, International Business Machines Corporation +Copyright (C) 2008-2013, International Business Machines Corporation and +Copyright (c) 2003-2008, International Business Machines +Copyright (C) 2003-2016, International Business Machines Corporation +Copyright (C) 2003-2014, International Business Machines Corporation +Copyright (C) 2003-2008, International Business Machines Corporation +Copyright (C) 2005-2008, International Business Machines +Copyright (C) 2003-2015, International Business Machines Corporation +Copyright (C) 2003-2009,2012,2016 International Business Machines Corporation and +Copyright (c) 2004-2016, International Business Machines © 2020 and later: Unicode, Inc. and others. +Copyright (C) 2007-2008, International Business Machines Corporation and +Copyright (C) 2001-2007, International Business Machines +Copyright (C) 1997-2012, International Business Machines +Copyright (C) 1997-2015, International Business Machines +Copyright (C) 2001-2010, International Business Machines +Copyright (c) 2000-2005, International Business Machines +Copyright (c) 2000-2007, International Business Machines © 2019 and later: Unicode, Inc. and others. 
+Copyright (C) 2010-2015, International Business Machines Corporation and +Copyright (C) 2015, International Business Machines Corporation and +Copyright (c) 2003-2013, International Business Machines +Copyright (C) 2001-2012, International Business Machines +Copyright (C) 2001-2011, International Business Machines Corporation +Copyright (C) 2014-2016, International Business Machines +Copyright (C) 1997-2015, International Business Machines Corporation +Copyright (C) 1999-2007, International Business Machines +Copyright (C) 1999-2007, International Business Machines Corporation +Copyright (C) 1999-2011, International Business Machines Corporation +Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2002-2016 International Business Machines Corporation and others. +Copyright (C) 2002-2016, International Business Machines Corporation and others. +Copyright (C) 2002-2016 International Business Machines Corporation +Copyright (C) 2002-2015, International Business Machines Corporation and others. +Copyright (C) 2012 International Business Machines Corporation +Copyright (C) 2002-2015 International Business Machines Corporation +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright (C) 2003-2010, International Business Machines Corporation and others. +Copyright (c) 2008-2011, International Business Machines Corporation and +Copyright (c) 2008-2010, International Business Machines Corporation and +Copyright (C) 2014-2016, International Business Machines Corporation and +Copyright (C) 2013, International Business Machines Corporation and +Copyright (c) 2014, International Business Machines +Copyright (C) 2014, International Business Machines +Copyright (C) 2013, International Business Machines +Copyright (C) 2001-2008,2010 IBM and others. All rights reserved. +Copyright (C) 2010 , Yahoo! Inc. +Copyright (c) 1997-2011, International Business Machines Corporation and +Copyright (C) 2013-2014, International Business Machines Corporation and +Copyright (C) 2009-2013, International Business Machines Corporation and +Copyright (C) 1996-2012, International Business Machines Corporation and +Copyright (C) 2015, International Business Machines Corporation +Copyright (c) 2001-2012, International Business Machines Corporation +Copyright (C) 2001-2014 IBM and others. All rights reserved. 
+Copyright (C) 2008-2014, Google, International Business Machines Corporation and +Copyright (C) 2008, Google, International Business Machines Corporation and +Copyright (C) 2008-2015, Google, International Business Machines Corporation +Copyright (c) 2001-2014, International Business Machines +Copyright (c) 2002-2010, International Business Machines Corporation +Copyright (C) 2011-2015, International Business Machines Corporation and +Copyright (C) 2011-2016, International Business Machines Corporation and +Copyright (C) 2011-2012, International Business Machines Corporation and +Copyright (C) 1996-2016, International Business Machines +Copyright (C) 1998-2014, International Business Machines +Copyright (C) 2004-2016, International Business Machines +Copyright (C) 2010-2011, International Business Machines +Copyright (C) 2009-2015, International Business Machines +Copyright (C) 2015, International Business Machines +Copyright (C) 2012-2016, International Business Machines +Copyright (C) 1999-2012, International Business Machines +Copyright (C) 2001, International Business Machines +Copyright (C) 2013, International Business Machines Corporation and others. +Copyright (C) 2010-2012, International Business Machines +Copyright (C) 2004-2015, International Business Machines +Copyright (C) 2003-2006, International Business Machines +Copyright (C) 2013-2015, International Business Machines Corporation and others. +Copyright (C) 2001-2015 IBM and others. All rights reserved. +Copyright (C) 2008-2015, International Business Machines Corporation +Copyright (C) 2008-2016, International Business Machines +Copyright (C) 2008-2013, International Business Machines Corporation +Copyright (C) 2004-2012, International Business Machines Corporation and +Copyright (C) 1997-2009,2014 International Business Machines +Copyright (C) 2009-2011, International Business Machines Corporation and +Copyright (C) 2009-2016, International Business Machines Corporation and +Copyright (C) 2009-2013, International Business Machines +Copyright (C) 2008-2011, International Business Machines +Copyright (C) 2007-2014, International Business Machines Corporation and +Copyright (C) 2009-2010, International Business Machines Corporation and +Copyright (C) 2001-2016 International Business Machines Corporation +Copyright (c) 2002-2011, International Business Machines +Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. +Copyright (c) 2013-2016 International Business Machines Corporation and others. All rights reserved. +Copyright (c) 2013-2015 International Business Machines Corporation and others. All rights reserved. 
+Copyright (c) 2007-2012, International Business Machines Corporation and +Copyright (c) 2007-2012, International Business Machines +Copyright (C) 2010, International Business Machines +Copyright (C) 1997-2011, International Business Machines +Copyright (C) 1997-2005, International Business Machines +Copyright (C) 2009-2011, International Business Machines +Copyright (C) 2003-2015, International Business Machines +Copyright (C) 2009-2016, International Business Machines +Copyright (C) 2008-2012, International Business Machines +Copyright (C) 2008, International Business Machines +Copyright (C) 2011-2014, International Business Machines +Copyright (C) 2011-2013, International Business Machines +Copyright (C) 2005, International Business Machines +Copyright (C) 1999-2013, International Business Machines +Copyright (C) 1998-2016, International Business Machines +Copyright (c) 2007-2014, International Business Machines Corporation and +Copyright (C) 2003-2013, International Business Machines +Copyright (c) 2007-2016, International Business Machines Corporation and +Copyright (c) 2008-2015, International Business Machines +Copyright (C) 1999-2010, International Business Machines +Copyright (C) 2000-2015, International Business Machines +Copyright (C) 2000-2011, International Business Machines +Copyright (C) 2000-2012, International Business Machines +Copyright (C) 2000-2010, International Business Machines +Copyright (C) 2004-2010, International Business Machines +Copyright (C) 2004-2005, International Business Machines +Copyright (c) 2013-2014, International Business Machines +Copyright (c) 1991-2013 Unicode, Inc. © 2019 Unicode®, Inc. +Copyright (C) 2018 and later: Unicode, Inc. and others. +Copyright (c) 2008-2013 International Business Machines +Copyright (C) 2002-2010, International Business Machines +Copyright (c) 2012-2015 International Business Machines © 2020 Unicode®, Inc. +Copyright (c) 2005-2013 IBM Corporation and others. All rights reserved +Copyright (c) 2011-2012, International Business Machines Corporation and +Copyright (C) 1998-2000, International Business Machines © 2017 Unicode®, Inc. +Copyright (c) 2007-2015 International Business Machines +Copyright (C) 2004-2006, International Business Machines +Copyright (C) 2003-2005, International Business Machines +Copyright (c) 1999-2014 International Business Machines +Copyright (c) 2003, International Business Machines +Copyright (C) 2014 International Business Machines +Copyright (c) 2001-2003 International Business Machines +Copyright (c) 2004-2011 International Business Machines +Copyright (C) 2015-2016, International Business Machines +Copyright (c) 2001-2015 International Business Machines +Copyright (C) 2003-2012, International Business Machines Corporation and COPYRIGHT AND PERMISSION NOTICE +Copyright (c) 2003 National Electronics and Computer Technology Center and others +Copyright (C) 2005-2010, International Business Machines +Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved +Copyright (C) 2004-2016 International Business Machines +Copyright (C) 1998-2013, International Business Machines +Copyright (C) 1998-2010, International Business Machines +Copyright (c) 1999-2004, International Business Machines +Copyright (C) 2002-2006 International Business Machines Corporation +Copyright (C) 1999-2006, International Business Machines +Copyright (C) 2002-2016 IBM, Inc. All Rights Reserved. +Copyright (c) 2002-2006, International Business Machines(C) Copyright IBM Corp. 
1998-2007 - All Rights Reserved +Copyright (C) 1999-2003, International Business Machines +Copyright (C) 1998-2006, International Business Machines Corporation and +Copyright (C) 1998-2003, International Business Machines Corporation and +Copyright (C) 2003 - 2008, International Business Machines +Copyright (C) 1999-2008, International Business Machines +Copyright (C) 1999-2001, International Business Machines +Copyright (C) 1999-2005, International Business Machines +Copyright (C) 2016 and later: Unicode, Inc. and others. +Copyright (c) 2001-2010 IBM Corporation and others. All Rights Reserved. +Copyright (C) 1998-2005, International Business Machines Corporation and +Copyright (C) 1998-2001, International Business Machines Corporation and +Copyright (c) 2002-2005, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2000-2014, International Business Machines +Copyright (C) 1996-2013, International Business Machines +Copyright (c) 2002-2006, International Business Machines Corporation and +Copyright (c) 2004-2010, International Business Machines Corporation and +Copyright (C) 2004-2011, International Business Machines +Copyright (c) 2002-2005, International Business Machines Corporation and +Copyright (c) 2002-2014, International Business Machines +Copyright (c) 1997-2012, International Business Machines +Copyright (c) 2002-2008, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2011-2013, Apple Inc.; Unicode, Inc.; and others. All Rights Reserved. +Copyright (C) 2011-2013, Apple Inc. and others. All Rights Reserved. +Copyright (c) 2005-2007,2010 Apple Inc., Unicode Inc.,and others. All Rights Reserved. +Copyright (c) 1999-2003, International Business Machines Corporation and +Copyright (c) 2003-2014, International Business Machines +Copyright (c) 2002-2010, International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 1999-2010, International Business Machines Corporation and +Copyright (c) 1999-2002, International Business Machines Corporation and +Copyright (C) 2002-2003, International Business Machines +Copyright (C) 2002, International Business Machines +Copyright (c) 2007, International Business Machines Corporation and +Copyright (C) 2007, International Business Machines +Copyright (C) 2001-2006, International Business Machines +Copyright (C) 2010-2014, International Business Machines Corporation and others. +Copyright (C) 2005-2016, International Business Machines Corporation and +Copyright (C) 2015-2016, International Business Machines Corporation and +Copyright (C) 2008-2012, International Business Machines Corporation +Copyright (c) 2006-2015 International Business Machines Corporation and others. All rights reserved. +Copyright (c) 2014-2015 International Business Machines Corporation and others. All rights reserved. +Copyright (C) 2002-2011, International Business Machines +Copyright (c) 2003-2010, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2012 IBM Corporation and Others. All Rights Reserved. +Copyright (C) 1998-2012, International Business Machines Corporation +Copyright (c) 2009, International Business Machines Corporation and +Copyright (C) The Internet Society (2002). All Rights Reserved. +Copyright (c) 2015, International Business Machines Corporation and +Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved. 
+Copyright (C) 1998-2016, International Business Machines Corporation +Copyright (c) 2011-2016,International Business Machines +Copyright (C) 2012 International Business Machines Corporation and Others. All Rights Reserved. +Copyright (C) 2011, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2011, International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 2011-2012,International Business Machines +Copyright (c) 2007, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2007-2007, International Business Machines(C) Copyright IBM Corp. 1998-2014 - All Rights Reserved +Copyright (C) 1998-2002, International Business Machines +Copyright (c) 2001-2007, International Business Machines Corporation and others. All Rights Reserved.(C) Copyright IBM Corp. 1998-2013 - All Rights Reserved +Copyright (C) 1998-2015, International Business Machines +Copyright (C) 2001-2014 International Business Machines +Copyright (C) 2011-2016, International Business Machines +Copyright (C) 2011-2015, International Business Machines +Copyright (c) 1999-2014, International Business Machines Corporation and +Copyright (c) 1999-2009, International Business Machines Corporation and +Copyright (c) 2010,International Business Machines +Copyright (c) 2010-2016,International Business Machines +Copyright (c) 2002-2005, International Business Machines +Copyright (C) 2000-2003, International Business Machines +Copyright (c) 2008-2014, International Business Machines Corporation and +Copyright (C) 2001 - 2005, International Business Machines +Copyright (C) 2001-2005, International Business Machines +Copyright (C) 1995-2014, International Business Machines +Copyright (c) 2000-2004 IBM, Inc. and Others. +Copyright (c) 2002-2014, International Business Machines Corporation and +Copyright (c) 2007-2013, International Business Machines Corporation and +Copyright (c) 2002-2012, International Business Machines Corporation and +Copyright (C) 2002-2012, International Business Machines +Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others. +Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2009-2014, International Business Machines +Copyright (C) 2008, International Business Machines Corporation and others. +Copyright (C) 2000-2016, International Business Machines +Copyright (C) 2011-2014 International Business Machines +Copyright (C) 1997-2014, International Business Machines +Copyright (C) 1997-2013, International Business Machines +Copyright (c) 2004-2006, International Business Machines +Copyright (C) 1997-2016, International Business Machines +Copyright (C) 1997-2006, International Business Machines +Copyright (C) 1997-2011, International Business Machines Corporation and others. +Copyright (C) 1997-2013, International Business Machines Corporation and others. +Copyright (c) 2004-2015, International Business Machines +Copyright (C) 2009-2017, International Business Machines Corporation,Google, and others. All Rights Reserved. +Copyright (C) 1997-2016, International Business Machines Corporation and others. +Copyright (C) 2008-2015, International Business Machines Corporation and +Copyright (C) 1997-2015, International Business Machines Corporation and others. +Copyright (C) 2014-2016, International Business Machines Corporation and others. 
+Copyright (c) 2014-2016, International Business Machines +Copyright (C) 2001-2011 IBM and others. All rights reserved. +Copyright (C) 1996-2014, International Business Machines Corporation and others. +Copyright (C) 1996-2016, International Business Machines Corporation and +Copyright (C) 2009-2016, International Business Machines Corporation, +Copyright (C) 2009-2010, Google, International Business Machines Corporation and +Copyright (C) 2008-2014, Google, International Business Machines Corporation +Copyright (C) 1996-2015, International Business Machines Corporation and +Copyright (c) 1996-2015, International Business Machines Corporation and others. +Copyright (C) 2010-2012,2015 International Business Machines +Copyright (C) 2007-2015, International Business Machines +Copyright (C) 2013-2014, International Business Machines Corporation and others. +Copyright (C) 2010-2013, International Business Machines +Copyright (c) 2002-2005, International Business Machines Corporation +Copyright (C) 2001-2011,2014 IBM and others. All rights reserved. +Copyright (C) 2008-2016, International Business Machines Corporation +Copyright (C) 2004 - 2008, International Business Machines Corporation and +Copyright (C) 1997-2011,2014-2015 International Business Machines +Copyright (C) 2001-2003, International Business Machines +Copyright (C) 1999-2009, International Business Machines +Copyright (C) 2020 and later: Unicode, Inc. and others. +Copyright (c) 2002, International Business Machines Corporation and +Copyright (C) 2000-2008, International Business Machines +Copyright (C) 1998-2006, International Business Machines +Copyright (C) 1998-2001, International Business Machines Corporation +Copyright (C) 1998-2004, International Business Machines Corporation +Copyright (C) 2000, International Business Machines +Copyright (c) 1999-2016, International Business Machines Corporation and +Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 1999-2012, International Business Machines Corporation and +Copyright (C) 1998-2011, International Business Machines +Copyright (C) 2008-2014, International Business Machines Corporation and +Copyright (C) 2003-2004, International Business Machines +Copyright (c) 2003-2005, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2002-2006 IBM, Inc. All Rights Reserved. +Copyright (C) 2004-2008, International Business Machines +Copyright (c) 2002-2016 International Business Machines Corporation and +Copyright (c) 2002-2015, International Business Machines Corporation and +Copyright (C) 2002-2016, International Business Machines Corporation +Copyright (c) 2002-2010,International Business Machines +Copyright (c) 2002-2014,International Business Machines +Copyright (c) 2002-2016,International Business Machines +Copyright (C) 2016 International Business Machines Corporation +Copyright © 2019 and later: Unicode, Inc. and others. +Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved. 
+Copyright (c) 2005-2006, International Business Machines Corporation and +Copyright (c) 1997-2004, International Business Machines Corporation +Copyright (c) 2012-2016, International Business Machines Corporation +Copyright (c) 2012-2014, International Business Machines Corporation and +Copyright (c) 1997-2014, International Business Machines Corporation +Copyright (c) 1996-2016, International Business Machines Corporation and +Copyright (c) 2003-2013, International Business Machines Corporation +Copyright (c) 2003-2008, International Business Machines Corporation +Copyright (c) 1997-2015, International Business Machines Corporation +Copyright (c) 2002-2016, International Business Machines Corporation and +Copyright (c) 1997-2002, International Business Machines Corporation and +Copyright (C) 1996-2012, International Business Machines +Copyright (c) 1997-2013 International Business Machines Corporation and +Copyright (c) 2010-2012, International Business Machines Corporation and +Copyright (c) 1997-2011, International Business Machines Corporation +Copyright (c) 1997-2006, International Business Machines Corporation and +Copyright (c) 2008-2016 International Business Machines Corporation and +Copyright (c) 2008-2016, International Business Machines Corporation and +Copyright (c) 1997-2016 International Business Machines Corporation and +Copyright (c) 2007-2011, International Business Machines +Copyright (c) 2007-2010, International Business Machines +Copyright (C) 2001-2016, International Business Machines Corporation and +Copyright (C) 2001-2003, International Business Machines Corporation and +Copyright (C) 2003-2011, International Business Machines +Copyright (c) 1997-2007, International Business Machines Corporation and +Copyright (c) 1997-2015, International Business Machines +Copyright (C) 2004-2009, International Business Machines Corporation and +Copyright (C) 2004, International Business Machines Corporation and +Copyright (C) 1996-2009, International Business Machines Corporation and +Copyright (C) 1996-2006, International Business Machines Corporation and +Copyright (C) 2011-2013, International Business Machines Corporation +Copyright (C) 2000-2007, International Business Machines +Copyright (c) 2001, International Business Machines Corporation and +Copyright (C) 2012-2013, International Business Machines +Copyright (c) 2010-2016, International Business Machines Corporation and +Copyright (c) 2010-2016, International Business Machines Corporation +Copyright (c) 1997-2010, International Business Machines Corporation +Copyright (c) 1997-2003, International Business Machines +Copyright (C) 2014-2015, International Business Machines Corporation and +Copyright (c) 1997-2013, International Business Machines Corporation +Copyright (c) 1999-2016, International Business Machines +Copyright (c) 1999-2016 International Business Machines Corporation and +Copyright (c) 2016, International Business Machines Corporation and +Copyright (c) 2016, International Business Machines +Copyright (c) 2013-2016, International Business Machines Corporation +Copyright (c) 2013, International Business Machines Corporation +Copyright (C) 2013-2016, International Business Machines Corporation and +Copyright (c) 2001-2010, International Business Machines Corporation and +Copyright (C) 2014, International Business Machines Corporation and +Copyright (c) 1999-2015, International Business Machines Corporation and +Copyright (C) 2001-2016, International Business Machines orporation +Copyright (c) 2001-2008, 
International Business Machines Corporation and others +Copyright (C) 2003-2016, International Business Machines Corporation and +Copyright (c) 2004, International Business Machines Corporation +Copyright (C) 2001-2009, International Business Machines +Copyright (c) 2004,2011 International Business Machines +Copyright (c) 2004-2011, International Business Machines +Copyright (c) 2000-2016, International Business Machines Corporation +Copyright (c) 2001-2005, International Business Machines Corporation and +Copyright (C) 2001-2004, International Business Machines +Copyright (c) 2001-2009, International Business Machines +Copyright (c) 1997-2009, International Business Machines Corporation +Copyright (c) 1997-2013, International Business Machines +Copyright (c) 1997-2012, International Business Machines Corporation +Copyright (C) 2007-2015, International Business Machines Corporation and +Copyright (C) 2007-2011, International Business Machines Corporation and +Copyright (C) 2007, International Business Machines Corporation and +Copyright (c) 1998-2005, International Business Machines Corporation and +Copyright (c) 2002-2010, International Business Machines Corporation and +Copyright (C) 1999-2016 International Business Machines Corporation and +Copyright (c) 2004-2011, International Business Machines Corporation and +Copyright (c) 2002-2007, International Business Machines Corporation and +Copyright (C) 2003, International Business Machines Corporation and +Copyright (C) 2005-2011, International Business Machines +Copyright (C) 2011-2012, International Business Machines +Copyright (C) 2007-2012, International Business Machines +Copyright (C) 2006-2016, International Business Machines Corporation +Copyright (C) 2006-2012, International Business Machines Corporation and others. +Copyright 2007 Google Inc. All Rights Reserved. +Copyright (c) 2001-2015, International Business Machines +Copyright (C) 2006-2014, International Business Machines Corporation +Copyright (C) 2008, International Business Machines Corporation and +Copyright (C) 2009-2012, International Business Machines +Copyright (C) 2006 International Business Machines Corporation +Copyright (C) 2010-2016, International Business Machines Corporation and +Copyright (C) 2002-2014, International Business Machines Corporation and +Copyright (C) 2002-2005, International Business Machines Corporation and +Copyright (C) 2011, International Business Machines +Copyright (c) 2003-2010 International Business Machines +Copyright (C) 2003-2003, International Business Machines +Copyright (C) 1999-2016 International Business Machines Corporation +Copyright (C) 1999-2014 International Business Machines Corporation +Copyright (C) 1999-2014 International Business Machines +Copyright (C) 2002-2011, International Business Machines Corporation and others. +Copyright (C) 2002-2008, International Business Machines Corporation and others. +Copyright (C) 2002-2008 International Business Machines Corporation +Copyright (c) 2001-2005, International Business Machines +Copyright (C) 2002-2014 International Business Machines Corporation +Copyright (c) 2003-2011, International Business Machines +Copyright (C) 1998-2012, International Business Machines Corporation and +Copyright (C) 2001-2014, International Business Machines Corporation. +Copyright (C) 2001-2011, International Business Machines Corporation. 
+Copyright (C) 2001-2014, International Business Machines Corporation and +Copyright (C) 2001-2011, International Business Machines Corporation and +Copyright (C) 2001-2012, International Business Machines Corporation and +Copyright 2004 and onwards Google Inc. +Copyright (C) 2004-2014, International Business Machines +Copyright (C) 2006, International Business Machines +Copyright (C) 2004-2012, International Business Machines +Copyright (C) 2001-2013, International Business Machines +Copyright (C) 1998-2004, International Business Machines +Copyright (C) 2000-2013, International Business Machines +Copyright (C) 1999-2015 International Business Machines +Copyright (C) 2000-2006, International Business Machines +Copyright (C) 1999-2004, International Business Machines +Copyright (C) 2003-2007, International Business Machines +Copyright (C) 2002-2006, International Business Machines +Copyright (C) 2001-2015, International Business Machines +Copyright (c) 2001-2012, International Business Machines +Copyright (c) 2002-2004, International Business Machines +Copyright (C) 1999-2016, International Business Machines Corporation and +Copyright (c) 1996-2014, International Business Machines +Copyright (C) 1999-2016, International Business Machines Corporation +Copyright (C) 2009-2014 International Business Machines +Copyright (C) 2004-2007, International Business Machines +Copyright (c) 2001-2016, International Business Machines +Copyright (C) 2003-2009, International Business Machines +Copyright (C) 1999-2013, International Business Machines Corporation and +Copyright (C) 1999-2015, International Business Machines Corporation and +Copyright (c) 2002-2011, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2001-2016 IBM, Inc. All Rights Reserved. +Copyright (C) 1999-2016 International Business Machines +Copyright (C) 2009-2010 IBM Corporation and Others. All Rights Reserved. +Copyright (C) 1998-2012, International Business Machines +Copyright (C) 1991 and later: Unicode, Inc. and others. +Copyright (C) 1997-2000, International Business Machines +Copyright (c) 1999-2007, International Business Machines Corporation and +Copyright (c) 2000 IBM, Inc. and Others. +Copyright (C) 2008-2013, International Business Machines +Copyright (C) 1998-2003, 2006, International Business Machines Corporation +Copyright (c) 2002-2003,International Business Machines +Copyright (C) 2009 International Business Machines +Copyright (C) 2010-2016 International Business Machines +Copyright (C) 2008-2012 IBM, Inc. All Rights Reserved. +Copyright (C) 1998-2008, International Business Machines +Copyright (C) 2010-2016, International Business Machines +Copyright (C) 1999-2006,2013 IBM Corp. All rights reserved. +Copyright (C) 2008-2009, International Business Machines Corporation and +Copyright (C) 2012,2014 International Business Machines +Copyright (c) 1996-2015, International Business Machines Corporation and +Copyright (C) 1997-2005, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 1999-2012, International Business Machines Corporation and +Copyright (C) 1996-2013, International Business Machines Corporation +Copyright (C) 1998-2005, International Business Machines +Copyright 2001 and onwards Google Inc. +Copyright (C) 2010-2012,2014, International Business Machines +Copyright (C) 1996-2015, International Business Machines Corporation and others. 
+Copyright (c) 2003-2004, International Business Machines
+Copyright (C) 2000-2004, International Business Machines
+Copyright (C) 2002-2013, International Business Machines
+Copyright (C) 2002-2011 International Business Machines Corporation and others. All Rights Reserved.
+Copyright (C) 1999-2010, International Business Machines Corporation and others.
+Copyright (C) 2001-2005, International Business Machines Corporation and others. All Rights Reserved.
+Copyright (c) 1996-2016, International Business Machines Corporation
+Copyright (C) 1997-2010, International Business Machines
+
 Software: opencv 4.2.0
 Copyright notice:
 Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
diff --git a/cmake/external_libs/icu4c.cmake b/cmake/external_libs/icu4c.cmake
new file mode 100644
index 00000000000..f54621c42ee
--- /dev/null
+++ b/cmake/external_libs/icu4c.cmake
@@ -0,0 +1,19 @@
+set(LIB_ICU_COMMON icuuc)
+set(LIB_ICU_DATA icudata)
+set(LIB_ICU_I18N icui18n)
+if (CMAKE_SYSTEM_NAME MATCHES "Windows")
+    message("icu4c thirdparty do not support windows currently.")
+else()
+    mindspore_add_pkg(icu4c
+            VER 67.1
+            LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
+            URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
+            MD5 0c2662a2b0bc80b0eb56495205247c8f
+            CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-tests=no --enable-samples=no --enable-icuio=no --enable-extras=no ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
+            )
+    include_directories(${icu4c_INC})
+    add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
+    add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
+    add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
+    add_definitions(-D ENABLE_ICU4C)
+endif()
\ No newline at end of file
diff --git a/cmake/mind_expression.cmake b/cmake/mind_expression.cmake
index 242e9c21365..86337c1dd2d 100644
--- a/cmake/mind_expression.cmake
+++ b/cmake/mind_expression.cmake
@@ -54,6 +54,7 @@ elseif(ENABLE_D OR ENABLE_TESTCASES)
 endif()
 
 if (ENABLE_MINDDATA)
+    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/icu4c.cmake)
     include(${CMAKE_SOURCE_DIR}/cmake/external_libs/jpeg_turbo.cmake)
     include(${CMAKE_SOURCE_DIR}/cmake/external_libs/libtiff.cmake)
     include(${CMAKE_SOURCE_DIR}/cmake/external_libs/opencv.cmake)
diff --git a/cmake/package.cmake b/cmake/package.cmake
index 653f26ffa5f..01f7bdabd8c 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -91,7 +91,20 @@ if (ENABLE_MINDDATA)
         DESTINATION ${INSTALL_LIB_DIR}
         COMPONENT mindspore
     )
-
+    if (CMAKE_SYSTEM_NAME MATCHES "Windows")
+        message("icu4c does not support windows system temporarily")
+    else()
+        file(GLOB_RECURSE ICU4C_LIB_LIST
+                ${icu4c_LIBPATH}/libicuuc*
+                ${icu4c_LIBPATH}/libicudata*
+                ${icu4c_LIBPATH}/libicui18n*
+        )
+        install(
+            FILES ${ICU4C_LIB_LIST}
+            DESTINATION ${INSTALL_LIB_DIR}
+            COMPONENT mindspore
+        )
+    endif()
 endif ()
 
 if (ENABLE_CPU)
diff --git a/mindspore/ccsrc/dataset/CMakeLists.txt b/mindspore/ccsrc/dataset/CMakeLists.txt
index da0741e5051..9238be93f29 100644
--- a/mindspore/ccsrc/dataset/CMakeLists.txt
+++ b/mindspore/ccsrc/dataset/CMakeLists.txt
@@ -108,10 +108,11 @@ target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar)
 if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
     target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY})
 else()
+    set(ICU_LIB mindspore::icuuc mindspore::icudata mindspore::icui18n)
    target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module -ldl mindspore::protobuf ${SECUREC_LIBRARY})
 endif()
 target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs
-                      mindspore::opencv_imgproc mindspore::tinyxml2)
+                      mindspore::opencv_imgproc mindspore::tinyxml2 ${ICU_LIB})
 if (ENABLE_GPUQUE)
     target_link_libraries(_c_dataengine PRIVATE gpu_queue
                           ${CUDNN_PATH}/lib64/libcudnn.so
diff --git a/mindspore/ccsrc/dataset/api/python_bindings.cc b/mindspore/ccsrc/dataset/api/python_bindings.cc
index 9e6940c5a36..fc01dd1c2b6 100644
--- a/mindspore/ccsrc/dataset/api/python_bindings.cc
+++ b/mindspore/ccsrc/dataset/api/python_bindings.cc
@@ -65,8 +65,21 @@
 #include "dataset/text/kernels/jieba_tokenizer_op.h"
 #include "dataset/text/kernels/ngram_op.h"
 #include "dataset/text/kernels/unicode_char_tokenizer_op.h"
+#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
 #include "dataset/text/vocab.h"
 #include "dataset/text/kernels/lookup_op.h"
+
+#ifdef ENABLE_ICU4C
+#include "dataset/text/kernels/basic_tokenizer_op.h"
+#include "dataset/text/kernels/bert_tokenizer_op.h"
+#include "dataset/text/kernels/case_fold_op.h"
+#include "dataset/text/kernels/normalize_utf8_op.h"
+#include "dataset/text/kernels/regex_replace_op.h"
+#include "dataset/text/kernels/regex_tokenizer_op.h"
+#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
+#include "dataset/text/kernels/whitespace_tokenizer_op.h"
+#endif
+
 #include "dataset/util/random.h"
 #include "mindrecord/include/shard_operator.h"
 #include "mindrecord/include/shard_pk_sample.h"
@@ -485,7 +498,7 @@ void bindTensorOps4(py::module *m) {
                         py::arg("fillR") = PadOp::kDefFillR, py::arg("fillG") = PadOp::kDefFillG,
                         py::arg("fillB") = PadOp::kDefFillB);
 }
-void bindTensorOps5(py::module *m) {
+void bindTokenizerOps(py::module *m) {
   (void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
     .def(py::init<const std::string &, const std::string &, const JiebaMode &>(), py::arg("hmm_path"),
          py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix)
@@ -503,6 +516,55 @@ void bindTensorOps5(py::module *m) {
                   const std::string &>(),
          py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"), py::arg("r_pad_token"),
          py::arg("separator"));
+  (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
+    *m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
+    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
+         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
+         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
+         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
+}
+
+void bindDependIcuTokenizerOps(py::module *m) {
+#ifdef ENABLE_ICU4C
+  (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
+    *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
+    .def(py::init<>());
+  (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
+    *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
+    .def(py::init<>())
+    .def(py::init<const bool &>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
+  (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
+    *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
+    .def(py::init<>());
+  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(
+    *m, "NormalizeUTF8Op", "Apply normalize operation on utf-8 string tensor.")
+    .def(py::init<>())
+    .def(py::init<NormalizeForm>(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm);
+  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(
+    *m, "RegexReplaceOp", "Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.")
+    .def(py::init<const std::string &, const std::string &, bool>(), py::arg("pattern"), py::arg("replace"),
+         py::arg("replace_all"));
+  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
+    *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
+    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"), py::arg("keep_delim_pattern"));
+  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
+    *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
+    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
+         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
+         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
+         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
+  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
+                                                                                "Tokenizer used for Bert text process.")
+    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
+                  NormalizeForm, bool>(),
+         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
+         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
+         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
+         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
+         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
+         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
+         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
+#endif
 }
 
 void bindSamplerOps(py::module *m) {
@@ -715,6 +777,16 @@ PYBIND11_MODULE(_c_dataengine, m) {
     .value("DE_JIEBA_HMM", JiebaMode::kHmm)
     .export_values();
 
+#ifdef ENABLE_ICU4C
+  (void)py::enum_<NormalizeForm>(m, "NormalizeForm", py::arithmetic())
+    .value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
+    .value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
+    .value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
+    .value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
+    .value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
+    .export_values();
+#endif
+
   (void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())
     .value("DE_INTER_LINEAR", InterpolationMode::kLinear)
     .value("DE_INTER_CUBIC", InterpolationMode::kCubic)
@@ -734,12 +806,13 @@ PYBIND11_MODULE(_c_dataengine, m) {
   bindTensorOps2(&m);
   bindTensorOps3(&m);
   bindTensorOps4(&m);
-  bindTensorOps5(&m);
+  bindTokenizerOps(&m);
   bindSamplerOps(&m);
   bindDatasetOps(&m);
   bindInfoObjects(&m);
   bindVocabObjects(&m);
   bindGraphData(&m);
+  bindDependIcuTokenizerOps(&m);
 }
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/dataset/text/kernels/CMakeLists.txt b/mindspore/ccsrc/dataset/text/kernels/CMakeLists.txt
index ff8fb95ea4e..8c4d19ab2c4 100644
--- a/mindspore/ccsrc/dataset/text/kernels/CMakeLists.txt
+++ b/mindspore/ccsrc/dataset/text/kernels/CMakeLists.txt
@@ -1,8 +1,21 @@
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
+if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
+    set(ICU_DEPEND_FILES
+        basic_tokenizer_op.cc
+        bert_tokenizer_op.cc
+        case_fold_op.cc
+        normalize_utf8_op.cc
+        regex_replace_op.cc
+        regex_tokenizer_op.cc
+        unicode_script_tokenizer_op.cc
+        whitespace_tokenizer_op.cc)
+endif()
 add_library(text-kernels OBJECT
     lookup_op.cc
     jieba_tokenizer_op.cc
     unicode_char_tokenizer_op.cc
     ngram_op.cc
+    wordpiece_tokenizer_op.cc
+    ${ICU_DEPEND_FILES}
     )
diff --git a/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc
new file mode 100644
index 00000000000..e8f5f1f15af
--- /dev/null
+++ b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc
@@ -0,0 +1,93 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "dataset/text/kernels/basic_tokenizer_op.h"
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace mindspore {
+namespace dataset {
+const bool BasicTokenizerOp::kDefLowerCase = false;
+const bool BasicTokenizerOp::kDefKeepWhitespace = false;
+const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
+const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
+const char BasicTokenizerOp::kCommonPattern[] =
+  "[!-/]"
+  "|[:-@]"
+  "|[\\[-`]"
+  "|[{-~]"
+  "|[\\p{P}]"
+  "|[\\x{4E00}-\\x{9FFF}]"
+  "|[\\x{3400}-\\x{4DBF}]"
+  "|[\\x{20000}-\\x{2A6DF}]"
+  "|[\\x{2A700}-\\x{2B73F}]"
+  "|[\\x{2B740}-\\x{2B81F}]"
+  "|[\\x{2B820}-\\x{2CEAF}]"
+  "|[\\x{F900}-\\x{FAFF}]"
+  "|[\\x{2F800}-\\x{2FA1F}]";
+const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|";
+
+BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
+                                   bool preserve_unused_token)
+    : lower_case_(lower_case),
+      keep_whitespace_(keep_whitespace),
+      preserve_unused_token_(preserve_unused_token),
+      case_fold_(std::make_unique<CaseFoldOp>()),
+      nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
+      common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
+      replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
+      replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
+  std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
+  std::string keep_delim_pattern;
+  if (keep_whitespace_) {
+    keep_delim_pattern = delim_pattern;
+  } else {
+    keep_delim_pattern = kCommonPattern;
+  }
+  if (preserve_unused_token_) {
+    keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
+    delim_pattern = kUnusedPattern + delim_pattern;
+  }
+  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
+}
+
+Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
+  IO_CHECK(input, output);
+  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
+  }
+  std::shared_ptr<Tensor> cur_input;
+  std::shared_ptr<Tensor> processed_tensor;
+  if (lower_case_) {
+    // to lower case
+    RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
+    cur_input = processed_tensor;
+    // strip accent characters
+    RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
+    cur_input = processed_tensor;
+    RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
+  } else {
+    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
+  }
+  // strip control characters
+  cur_input = processed_tensor;
+  RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
+  return regex_tokenizer_->Compute(processed_tensor, output);
+}
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h
new file mode 100644
index 00000000000..da79ad08766
--- /dev/null
+++ b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
+#define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
+#include <memory>
+#include <string>
+
+#include "dataset/core/tensor.h"
+#include "dataset/kernels/tensor_op.h"
+#include "dataset/text/kernels/case_fold_op.h"
+#include "dataset/text/kernels/normalize_utf8_op.h"
+#include "dataset/text/kernels/regex_replace_op.h"
+#include "dataset/text/kernels/regex_tokenizer_op.h"
+#include "dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+
+class BasicTokenizerOp : public TensorOp {
+ public:
+  static const bool kDefLowerCase;
+  static const bool kDefKeepWhitespace;
+  static const NormalizeForm kDefNormalizationForm;
+  static const bool kDefPreserveUnusedToken;
+  BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
+                   NormalizeForm normalization_form = kDefNormalizationForm,
+                   bool preserve_unused_token = kDefPreserveUnusedToken);
+
+  ~BasicTokenizerOp() override = default;
+
+  void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }
+
+  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+
+ private:
+  static const char kCommonPattern[];
+  static const char kUnusedPattern[];
+  bool lower_case_;
+  bool keep_whitespace_;
+  NormalizeForm normalization_form_;
+  bool preserve_unused_token_;
+  std::unique_ptr<CaseFoldOp> case_fold_;
+  std::unique_ptr<NormalizeUTF8Op> nfd_normalize_;
+  std::unique_ptr<NormalizeUTF8Op> common_normalize_;
+  std::unique_ptr<RegexReplaceOp> replace_accent_chars_;
+  std::unique_ptr<RegexReplaceOp> replace_control_chars_;
+  std::unique_ptr<RegexTokenizerOp> regex_tokenizer_;
+};
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
diff --git a/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc
new file mode 100644
index 00000000000..2b68a5accb6
--- /dev/null
+++ b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "dataset/text/kernels/bert_tokenizer_op.h"
+namespace mindspore {
+namespace dataset {
+Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
+  IO_CHECK(input, output);
+  std::shared_ptr<Tensor> basic_tensor;
+  RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
+  RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
+  return Status::OK();
+}
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h
new file mode 100644
index 00000000000..61c6785f357
--- /dev/null
+++ b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
+#define DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
+#include <memory>
+#include <string>
+
+#include "dataset/core/tensor.h"
+#include "dataset/kernels/tensor_op.h"
+#include "dataset/text/kernels/basic_tokenizer_op.h"
+#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
+#include "dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+class BertTokenizerOp : public TensorOp {
+ public:
+  BertTokenizerOp(const std::shared_ptr<Vocab> &vocab,
+                  const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
+                  const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
+                  const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
+                  bool lower_case = BasicTokenizerOp::kDefLowerCase,
+                  bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
+                  NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
+                  bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
+      : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
+        basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}
+
+  ~BertTokenizerOp() override = default;
+
+  void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }
+
+  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+
+ private:
+  WordpieceTokenizerOp wordpiece_tokenizer_;
+  BasicTokenizerOp basic_tokenizer_;
+};
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
diff --git a/mindspore/ccsrc/dataset/text/kernels/case_fold_op.cc b/mindspore/ccsrc/dataset/text/kernels/case_fold_op.cc
new file mode 100644
index 00000000000..d935608efda
--- /dev/null
+++ b/mindspore/ccsrc/dataset/text/kernels/case_fold_op.cc
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dataset/text/kernels/case_fold_op.h" +#include +#include +#include +#include +#include + +#include "unicode/errorcode.h" +#include "unicode/normalizer2.h" +#include "unicode/utypes.h" + +namespace mindspore { +namespace dataset { + +Status CaseFoldOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + icu::ErrorCode error; + const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCCasefoldInstance failed."); + std::vector strs(input->Size()); + int i = 0; + for (auto iter = input->begin(); iter != input->end(); iter++) { + icu::StringByteSink sink(&strs[i++]); + nfkc_case_fold->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed."); + } + *output = std::make_shared(std::move(strs), input->shape()); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/case_fold_op.h b/mindspore/ccsrc/dataset/text/kernels/case_fold_op.h new file mode 100644 index 00000000000..d1b5ba53f1e --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/case_fold_op.h @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_ +#define DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_ +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class CaseFoldOp : public TensorOp { + public: + CaseFoldOp() {} + + ~CaseFoldOp() override = default; + + void Print(std::ostream &out) const override { out << "CaseFoldOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc index 16f94096454..de1d915fbb4 100644 --- a/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc @@ -29,6 +29,7 @@ JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::strin } Status JiebaTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); RETURN_UNEXPECTED_IF_NULL(jieba_parser_); if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { diff --git a/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc b/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc index d4661ea16b2..07cf7aef5c0 100644 --- a/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc @@ -24,6 +24,7 @@ LookupOp::LookupOp(std::shared_ptr vocab, WordIdType default_id) : vocab_(vocab), default_id_(default_id), type_(DataType("int32")) {} Status LookupOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); RETURN_UNEXPECTED_IF_NULL(vocab_); CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor"); std::vector word_ids; diff --git a/mindspore/ccsrc/dataset/text/kernels/ngram_op.cc b/mindspore/ccsrc/dataset/text/kernels/ngram_op.cc index a5a135baafe..bbe449a89a1 100644 --- a/mindspore/ccsrc/dataset/text/kernels/ngram_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/ngram_op.cc @@ -34,6 +34,7 @@ NgramOp::NgramOp(const std::vector &ngrams, int32_t l_len, int32_t r_le separator_(separator) {} Status NgramOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING && input->Rank() == 1, "Not a 1-D str Tensor"); std::vector offsets; // offsets for each str std::vector res; // holds the result of ngrams diff --git a/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.cc b/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.cc new file mode 100644 index 00000000000..b9022865764 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.cc @@ -0,0 +1,75 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/text/kernels/normalize_utf8_op.h" +#include +#include +#include +#include +#include + +#include "unicode/errorcode.h" +#include "unicode/normalizer2.h" +#include "unicode/utypes.h" + +namespace mindspore { +namespace dataset { +const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc; +Status NormalizeUTF8Op::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + icu::ErrorCode error; + const icu::Normalizer2 *normalize = nullptr; + switch (normalize_form_) { + case NormalizeForm::kNone: { + *output = input; + return Status::OK(); + } + case NormalizeForm::kNfc: { + normalize = icu::Normalizer2::getNFCInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFCInstance failed"); + break; + } + case NormalizeForm::kNfkc: { + normalize = icu::Normalizer2::getNFKCInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCInstance failed"); + break; + } + case NormalizeForm::kNfd: { + normalize = icu::Normalizer2::getNFDInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFDInstance failed"); + break; + } + case NormalizeForm::kNfkd: { + normalize = icu::Normalizer2::getNFKDInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKDInstance failed"); + break; + } + default: { + RETURN_STATUS_UNEXPECTED("unexpected normalize form"); + break; + } + } + std::vector strs(input->Size()); + int i = 0; + for (auto iter = input->begin(); iter != input->end(); iter++) { + icu::StringByteSink sink(&strs[i++]); + normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed."); + } + *output = std::make_shared(std::move(strs), input->shape()); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.h b/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.h new file mode 100644 index 00000000000..5033f2355fb --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.h @@ -0,0 +1,50 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_ +#define DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_ +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { +enum class NormalizeForm { + kNone = 0, + kNfc, + kNfkc, + kNfd, + kNfkd, +}; + +class NormalizeUTF8Op : public TensorOp { + public: + static const NormalizeForm kDefNormalizeForm; + explicit NormalizeUTF8Op(NormalizeForm normalize_form = kDefNormalizeForm) : normalize_form_(normalize_form) {} + + ~NormalizeUTF8Op() override = default; + + void Print(std::ostream &out) const override { out << "NormalizeUTF8Op"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + NormalizeForm normalize_form_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.cc b/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.cc new file mode 100644 index 00000000000..1ce2c5ea61e --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.cc @@ -0,0 +1,57 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/text/kernels/regex_replace_op.h" +#include +#include +#include +#include +#include + +namespace mindspore { +namespace dataset { + +Status RegexReplaceOp::RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, + std::string *out) const { + CHECK_FAIL_RETURN_UNEXPECTED((matcher != nullptr && out != nullptr), "Input is null"); + UErrorCode icu_error = U_ZERO_ERROR; + icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text); + matcher->reset(unicode_text); + icu::UnicodeString unicode_out; + if (replace_all_) { + unicode_out = matcher->replaceAll(replace_, icu_error); + } else { + unicode_out = matcher->replaceFirst(replace_, icu_error); + } + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "RegexReplace failed"); + unicode_out.toUTF8String(*out); + return Status::OK(); +} + +Status RegexReplaceOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + UErrorCode icu_error = U_ZERO_ERROR; + icu::RegexMatcher matcher(pattern_, 0, icu_error); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "Create icu RegexMatcher failed, you may input one error pattern"); + std::vector strs(input->Size()); + int i = 0; + for (auto iter = input->begin(); iter != input->end(); iter++) { + RETURN_IF_NOT_OK(RegexReplace(&matcher, *iter, &strs[i])); + } + *output = std::make_shared(std::move(strs), input->shape()); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.h b/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.h new file mode 100644 index 00000000000..30fae132412 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.h @@ -0,0 +1,55 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_ +#define DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_ +#include +#include + +#include "unicode/regex.h" +#include "unicode/errorcode.h" +#include "unicode/utypes.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class RegexReplaceOp : public TensorOp { + public: + RegexReplaceOp(const std::string &pattern, const std::string &replace, bool replace_all = true) + : pattern_(icu::UnicodeString::fromUTF8(pattern)), + replace_(icu::UnicodeString::fromUTF8(replace)), + replace_all_(replace_all) {} + + ~RegexReplaceOp() override = default; + + void Print(std::ostream &out) const override { out << "RegexReplaceOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + protected: + Status RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, std::string *out) const; + + private: + const icu::UnicodeString pattern_; + const icu::UnicodeString replace_; + const bool replace_all_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc new file mode 100644 index 00000000000..34c06f28ea3 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc @@ -0,0 +1,103 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/text/kernels/regex_tokenizer_op.h" +#include +#include +#include +#include +#include + +namespace mindspore { +namespace dataset { +Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8, + icu::UnicodeString *out_unicode) const { + CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input"); + int total_len = input.length(); + int end = start + len; + CHECK_FAIL_RETURN_UNEXPECTED((start >= 0 && len > 0 && end <= total_len), "Out of range"); + icu::UnicodeString temp; + input.extract(start, len, temp); + if (out_utf8 != nullptr) { + temp.toUTF8String(*out_utf8); + } + if (out_unicode != nullptr) { + *out_unicode = temp; + } + return Status::OK(); +} + +Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector *out_tokens) const { + UErrorCode status = U_ZERO_ERROR; + out_tokens->clear(); + icu::RegexMatcher token_matcher(delim_pattern_, 0, status); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern"); + icu::RegexMatcher delim_matcher(keep_delim_pattern_, 0, status); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern"); + + icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text)); + token_matcher.reset(utext); + + int token_start_index = 0; + status = U_ZERO_ERROR; + while (token_matcher.find(status) && U_SUCCESS(status)) { + int deli_start_index = token_matcher.start(status); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed"); + int deli_end_index = token_matcher.end(status); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed"); + + // Add non-empty token + int token_len = deli_start_index - token_start_index; + if (token_len > 0) { + std::string token; + RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token)); + out_tokens->emplace_back(std::move(token)); + } + + int delim_len = deli_end_index - deli_start_index; + if (keep_delim_ && delim_len > 0) { + icu::UnicodeString delim_str; + std::string delim_utf8_str; + RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str)); + delim_matcher.reset(delim_str); + if (delim_matcher.matches(status) && U_SUCCESS(status)) { + out_tokens->emplace_back(std::move(delim_utf8_str)); + } + } + token_start_index = deli_end_index; + } + + if (token_start_index < utext.length()) { + std::string temp; + RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp)); + out_tokens->emplace_back(std::move(temp)); + } + return Status::OK(); +} + +Status RegexTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); + } + std::string_view text; + RETURN_IF_NOT_OK(input->GetItemAt(&text, {})); + std::vector tokens; + RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens)); + *output = std::make_shared(std::move(tokens), TensorShape({(dsize_t)tokens.size()})); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h new file mode 100644 index 
00000000000..bcf02a4a118 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_TEXT_REGEX_TOKENIZER_OP_H_ +#define DATASET_TEXT_REGEX_TOKENIZER_OP_H_ +#include +#include +#include + +#include "unicode/regex.h" +#include "unicode/errorcode.h" +#include "unicode/utypes.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class RegexTokenizerOp : public TensorOp { + public: + RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern) + : delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)), + keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)), + keep_delim_(!keep_delim_pattern.empty()) {} + + ~RegexTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + protected: + Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8, + icu::UnicodeString *out_unicode = nullptr) const; + Status GetRegexTokens(const std::string &text, std::vector *out_tokens) const; + + private: + const icu::UnicodeString delim_pattern_; + const icu::UnicodeString keep_delim_pattern_; + const bool keep_delim_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_REGEX_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc index 343e0791532..063bf216308 100644 --- a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc @@ -28,6 +28,7 @@ namespace mindspore { namespace dataset { Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); } diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h index 53c42d599eb..01a84eca8ba 100644 --- a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_ -#define DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_ +#ifndef DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_ #include #include "dataset/core/tensor.h" @@ -37,4 +37,4 @@ class UnicodeCharTokenizerOp : public TensorOp { } // namespace dataset } // namespace mindspore -#endif // DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_ +#endif // DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc new file mode 100644 index 00000000000..97a4f1333dc --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc @@ -0,0 +1,93 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dataset/text/kernels/unicode_script_tokenizer_op.h" +#include +#include +#include +#include +#include + +#include "cppjieba/Unicode.hpp" +#include "unicode/errorcode.h" +#include "unicode/uchar.h" +#include "unicode/uscript.h" + +using cppjieba::DecodeRunesInString; +using cppjieba::RuneStrArray; + +namespace mindspore { +namespace dataset { + +const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false; + +Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); + } + std::string_view str; + RETURN_IF_NOT_OK(input->GetItemAt(&str, {})); + RuneStrArray runes; + if (!DecodeRunesInString(str.data(), str.size(), runes)) { + RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); + } + + UScriptCode last_script = USCRIPT_INVALID_CODE; + icu::ErrorCode status; + int start = 0; + int len = 0; + std::vector splits; + + bool was_space = false; + for (size_t i = 0; i < runes.size(); i++) { + bool is_space = u_isUWhiteSpace(runes[i].rune); + UScriptCode script = uscript_getScript(runes[i].rune, status); + if (status.isFailure()) { + status.reset(); + script = USCRIPT_INVALID_CODE; + } + // 1) Seperate UTF-8 strings of different UScriptCode values + // (such as: "Chinese中国" should be splited to ["Chinese", "中国"]) + // 2) Seperate whitespace and non-whitespace UTF-8 strings + // (such as: " ." 
should be split to [" ", "."]) + if (len > 0 && (script != last_script || is_space != was_space)) { + // 3) If keep_whitespace_ is false, all the whitespace characters will be discard + if (keep_whitespace_ || !was_space) { + std::string temp(str.substr(start, len)); + splits.emplace_back(std::move(temp)); + } + start = runes[i].offset; + len = runes[i].len; + } else { + len += runes[i].len; + } + last_script = script; + was_space = is_space; + } + + if (len > 0 && (keep_whitespace_ || !was_space)) { + std::string temp(str.substr(start, len)); + splits.emplace_back(std::move(temp)); + } + // 4) If the input is empty scalar string, the output will be 1-D empty string. + if (splits.empty()) { + splits.emplace_back(""); + } + *output = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h new file mode 100644 index 00000000000..a77b0b3fa3e --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_ +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class UnicodeScriptTokenizerOp : public TensorOp { + public: + static const bool kDefKeepWhitespace; + + explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {} + + ~UnicodeScriptTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + bool keep_whitespace_; // If or not keep whitespace tokens +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc new file mode 100644 index 00000000000..35f3f8d0e23 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc @@ -0,0 +1,73 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dataset/text/kernels/whitespace_tokenizer_op.h" +#include +#include +#include +#include +#include + +#include "cppjieba/Unicode.hpp" +#include "unicode/errorcode.h" +#include "unicode/uchar.h" +#include "unicode/uscript.h" + +using cppjieba::DecodeRunesInString; +using cppjieba::RuneStrArray; + +namespace mindspore { +namespace dataset { +Status WhitespaceTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); + } + std::string_view str; + RETURN_IF_NOT_OK(input->GetItemAt(&str, {})); + + RuneStrArray runes; + if (!DecodeRunesInString(str.data(), str.size(), runes)) { + RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); + } + std::vector splits; + int start = 0; + int len = 0; + for (size_t i = 0; i < runes.size(); i++) { + if (u_isUWhiteSpace(runes[i].rune)) { + if (len > 0) { + std::string temp(str.substr(start, len)); + splits.emplace_back(std::move(temp)); + len = 0; + } + } else { + if (len == 0) { + start = runes[i].offset; + } + len += runes[i].len; + } + } + if (len > 0) { + std::string temp(str.substr(start, len)); + splits.emplace_back(std::move(temp)); + } + if (splits.empty()) { + splits.emplace_back(""); + } + *output = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h new file mode 100644 index 00000000000..6d0bab0bea6 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_ +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class WhitespaceTokenizerOp : public TensorOp { + public: + WhitespaceTokenizerOp() {} + + ~WhitespaceTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc new file mode 100644 index 00000000000..d1fd1d0dcac --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc @@ -0,0 +1,138 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dataset/text/kernels/wordpiece_tokenizer_op.h" +#include +#include + +namespace mindspore { +namespace dataset { + +const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##"; +const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100; +const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]"; + +WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr &vocab, const std::string &suffix_indicator, + const int &max_bytes_per_token, const std::string &unknown_token) + : vocab_(vocab), + suffix_indicator_(suffix_indicator), + max_bytes_per_token_(max_bytes_per_token), + unknown_token_(unknown_token) {} + +void WordpieceTokenizerOp::PadTokens(const std::vector> &tokens, const std::string &padded_str, + std::vector *out_padded_tokens, int *out_cols) const { + int rows = tokens.size(); + int max_cols = 0; + for (int i = 0; i < rows; i++) { + max_cols = std::max(max_cols, static_cast(tokens[i].size())); + } + out_padded_tokens->resize(rows * max_cols, padded_str); + for (int i = 0; i < rows; i++) { + int index = i * max_cols; + for (int j = 0; j < tokens[i].size(); j++) { + (*out_padded_tokens)[index++] = tokens[i][j]; + } + } + *out_cols = max_cols; +} + +Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, + bool *out_found, int *out_end) const { + CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range"); + *out_found = false; + for (int i = runes.size() - 1; i >= 0; i--) { + *out_end = runes[i].offset + runes[i].len; + int len = *out_end - start; + std::string word = input_token.substr(start, len); + if (start > 0) { + word = suffix_indicator_ + word; + } + WordIdType default_id = -1; + if (vocab_->Lookup(word, default_id) != default_id) { + *out_found = true; + break; + } + } + return Status::OK(); +} + +Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, 
std::vector *out_tokens) const { + out_tokens->clear(); + if (unknown_token_.empty()) { + out_tokens->emplace_back(input_token); + } else { + out_tokens->emplace_back(unknown_token_); + } + return Status::OK(); +} + +Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end, + std::vector *out_tokens) const { + CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range"); + std::string subword = input_token.substr(start, end - start); + if (start > 0) { + subword = suffix_indicator_ + subword; + } + out_tokens->emplace_back(subword); + return Status::OK(); +} + +Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector *out_tokens) const { + if (input_token.size() > max_bytes_per_token_) { + return FoundNoToken(input_token, out_tokens); + } + RuneStrArray runes; + if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) { + RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); + } + int end; + for (int start = 0; start < input_token.size();) { + bool found; + RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end)); + if (found) { + RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens)); + start = end; + } else { + return FoundNoToken(input_token, out_tokens); + } + } + return Status::OK(); +} + +Status WordpieceTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if (input->Rank() > 1 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor"); + } + std::vector> out_tokens(input->Size()); + int i = 0; + for (auto iter = input->begin(); iter != input->end(); iter++) { + RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++])); + } + std::vector padded_tokens; + int cols = 0; + PadTokens(out_tokens, "", &padded_tokens, &cols); + std::vector shapes; + if (input->Rank() == 1) { + shapes.push_back(out_tokens.size()); + } + shapes.push_back(cols); + *output = std::make_shared(std::move(padded_tokens), TensorShape(shapes)); + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h new file mode 100644 index 00000000000..d74f28df47a --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h @@ -0,0 +1,68 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_ +#include +#include +#include +#include + +#include "cppjieba/Unicode.hpp" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/text/vocab.h" +#include "dataset/util/status.h" + +using cppjieba::DecodeRunesInString; +using cppjieba::RuneStrArray; +namespace mindspore { +namespace dataset { + +class WordpieceTokenizerOp : public TensorOp { + public: + static const char kDefSuffixIndicator[]; + static const int kDefMaxBytesPerToken; + static const char kDefUnknownToken[]; + WordpieceTokenizerOp(const std::shared_ptr &vocab, const std::string &suffix_indicator = kDefSuffixIndicator, + const int &max_bytes_per_token = kDefMaxBytesPerToken, + const std::string &unknown_token = kDefUnknownToken); + + ~WordpieceTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + protected: + void PadTokens(const std::vector> &tokens, const std::string &padded_str, + std::vector *out_padded_tokens, int *out_cols) const; + Status AddSubword(const std::string &input_token, const int start, const int end, + std::vector *out_token) const; + Status FoundNoToken(const std::string &input_token, std::vector *out_tokens) const; + Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found, + int *out_end) const; + Status GetTokens(const std::string &input_token, std::vector *out_tokens) const; + + private: + const std::shared_ptr vocab_; + const std::string suffix_indicator_; + const int max_bytes_per_token_; + const std::string unknown_token_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_ diff --git a/mindspore/dataset/text/__init__.py b/mindspore/dataset/text/__init__.py index 9da10f5447c..b98093d45ae 100644 --- a/mindspore/dataset/text/__init__.py +++ b/mindspore/dataset/text/__init__.py @@ -15,5 +15,18 @@ """ mindspore.dataset.text """ -from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram -from .utils import to_str, to_bytes, JiebaMode, Vocab +import platform +from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer +from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm + +__all__ = [ + "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram", + "to_str", "to_bytes", "JiebaMode", "Vocab", "WordpieceTokenizer" +] + +if platform.system().lower() != 'windows': + from .transforms import UnicodeScriptTokenizer, WhitespaceTokenizer, CaseFold, NormalizeUTF8, \ + RegexReplace, RegexTokenizer, BasicTokenizer, BertTokenizer + + __all__.append(["UnicodeScriptTokenizer", "WhitespaceTokenizer", "CaseFold", "NormalizeUTF8", + "RegexReplace", "RegexTokenizer", "BasicTokenizer", "BertTokenizer", "NormalizeForm"]) diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index 7043de218dc..c9cfd55999d 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -17,10 +17,11 @@ c transforms for all text related operators import os import re +import platform import mindspore._c_dataengine as cde -from .utils import JiebaMode +from .utils import JiebaMode, NormalizeForm from .validators import check_lookup, check_jieba_add_dict, \ check_jieba_add_word, check_jieba_init, 
check_ngram @@ -174,3 +175,172 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string to Unicode characters. """ + + +class WordpieceTokenizer(cde.WordpieceTokenizerOp): + """ + Tokenize a scalar token or 1-D tokens into subword tokens. + + Args: + vocab(Vocab): a Vocab object. + suffix_indicator(string, optional): Used to show that the subword is the last part of a word (default '##'). + max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split (default 100). + unknown_token(string, optional): Used when a token cannot be found in the vocab: if 'unknown_token' is an empty string, + return the token directly; otherwise return 'unknown_token' (default '[UNK]'). + """ + + def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'): + self.vocab = vocab + self.suffix_indicator = suffix_indicator + self.max_bytes_per_token = max_bytes_per_token + self.unknown_token = unknown_token + super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token) + + +if platform.system().lower() != 'windows': + class WhitespaceTokenizer(cde.WhitespaceTokenizerOp): + """ + Tokenize a scalar tensor of UTF-8 string on ICU-defined whitespace characters (such as ' ', '\t', '\r', '\n'). + """ + + + class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp): + """ + Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. + + Args: + keep_whitespace(bool, optional): Whether or not to emit whitespace tokens (default False). + """ + + def __init__(self, keep_whitespace=False): + self.keep_whitespace = keep_whitespace + super().__init__(self.keep_whitespace) + + + class CaseFold(cde.CaseFoldOp): + """ + Apply a case fold operation on a UTF-8 string tensor. + """ + + + DE_C_INTER_NORMALIZE_FORM = { + NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE, + NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC, + NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC, + NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD, + NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD + } + + + class NormalizeUTF8(cde.NormalizeUTF8Op): + """ + Apply a normalization operation on a UTF-8 string tensor. + + Args: + normalize_form(Enum, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD". + If set to "NONE", the input string tensor is left unchanged. + If set to any of "NFC", "NFKC", "NFD", "NFKD", the corresponding normalization is applied (default "NFKC"). + See http://unicode.org/reports/tr15/ for details. + """ + + def __init__(self, normalize_form=NormalizeForm.NFKC): + self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] + super().__init__(self.normalize_form) + + + class RegexReplace(cde.RegexReplaceOp): + """ + Replace parts of a UTF-8 string tensor with 'replace' according to the regular expression 'pattern'. + See http://userguide.icu-project.org/strings/regexp for supported regex patterns. + + Args: + pattern(string): the regex expression pattern. + replace(string): the string used to replace matched elements. + replace_all(bool, optional): If False, only replace the first matched element; + if True, replace all matched elements (default True). + """ + + def __init__(self, pattern, replace, replace_all=True): + self.pattern = pattern + self.replace = replace + self.replace_all = replace_all + super().__init__(self.pattern, self.replace, self.replace_all) + + + class RegexTokenizer(cde.RegexTokenizerOp): + """ + Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
+ See http://userguide.icu-project.org/strings/regexp for supported regex patterns. + + Args: + delim_pattern(string): The pattern of regex delimiters. + The original string will be split by matched elements. + keep_delim_pattern(string, optional): The string matched by 'delim_pattern' can be kept as a token + if it also matches 'keep_delim_pattern'. The default value is an empty string (''), + in which case delimiters will not be kept as output tokens. + """ + + def __init__(self, delim_pattern, keep_delim_pattern=''): + self.delim_pattern = delim_pattern + self.keep_delim_pattern = keep_delim_pattern + super().__init__(self.delim_pattern, self.keep_delim_pattern) + + + class BasicTokenizer(cde.BasicTokenizerOp): + """ + Tokenize a scalar tensor of UTF-8 string by specific rules. + + Args: + lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations + on the input text to convert it to lower case and strip accent characters; if False, only apply + the NormalizeUTF8 ('normalization_form' mode) operation on the input text (default False). + keep_whitespace(bool, optional): If True, whitespace will be kept in the output tokens (default False). + normalization_form(Enum, optional): Used to specify a normalization mode, + only effective when 'lower_case' is False. See NormalizeUTF8 for details (default 'NONE'). + preserve_unused_token(bool, optional): If True, do not split special tokens like + '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default True). + """ + + def __init__(self, lower_case=False, keep_whitespace=False, + normalization_form=NormalizeForm.NONE, preserve_unused_token=True): + self.lower_case = lower_case + self.keep_whitespace = keep_whitespace + self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] + self.preserve_unused_token = preserve_unused_token + super().__init__(self.lower_case, self.keep_whitespace, + self.normalization_form, self.preserve_unused_token) + + + class BertTokenizer(cde.BertTokenizerOp): + """ + Tokenizer used for BERT text processing. + + Args: + vocab(Vocab): a Vocab object. + suffix_indicator(string, optional): Used to show that the subword is the last part of a word (default '##'). + max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split (default 100). + unknown_token(string, optional): Used when a token cannot be found in the vocab: if 'unknown_token' is an empty string, + return the token directly; otherwise return 'unknown_token' (default '[UNK]'). + lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations + on the input text to convert it to lower case and strip accent characters; if False, only apply + the NormalizeUTF8 ('normalization_form' mode) operation on the input text (default False). + keep_whitespace(bool, optional): If True, whitespace will be kept in the output tokens (default False). + normalization_form(Enum, optional): Used to specify a normalization mode, + only effective when 'lower_case' is False. See NormalizeUTF8 for details (default 'NONE'). + preserve_unused_token(bool, optional): If True, do not split special tokens like + '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default True).
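+
+ A minimal, illustrative usage sketch (the vocab contents and the file name below are hypothetical,
+ and building the vocab with Vocab.from_list assumes that helper is available in this version):
+ >>> import mindspore.dataset as ds
+ >>> import mindspore.dataset.text as nlp
+ >>> vocab = nlp.Vocab.from_list(["床", "前", "明", "月", "光", "[UNK]"])  # hypothetical vocab
+ >>> tokenizer = nlp.BertTokenizer(vocab, lower_case=False)
+ >>> dataset = ds.TextFileDataset("bert_tokenizer.txt", shuffle=False)  # hypothetical file
+ >>> dataset = dataset.map(operations=tokenizer)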
+ """ + + def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, + unknown_token='[UNK]', lower_case=False, keep_whitespace=False, + normalization_form=NormalizeForm.NONE, preserve_unused_token=True): + self.vocab = vocab + self.suffix_indicator = suffix_indicator + self.max_bytes_per_token = max_bytes_per_token + self.unknown_token = unknown_token + self.lower_case = lower_case + self.keep_whitespace = keep_whitespace + self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] + self.preserve_unused_token = preserve_unused_token + super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token, + self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token) diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py index b590937d7d0..87be604cbbf 100644 --- a/mindspore/dataset/text/utils.py +++ b/mindspore/dataset/text/utils.py @@ -127,3 +127,11 @@ class JiebaMode(IntEnum): MIX = 0 MP = 1 HMM = 2 + + +class NormalizeForm(IntEnum): + NONE = 0 + NFC = 1 + NFKC = 2 + NFD = 3 + NFKD = 4 diff --git a/tests/ut/cpp/dataset/tokenizer_op_test.cc b/tests/ut/cpp/dataset/tokenizer_op_test.cc index a828c97f7b8..8a18f0da0cf 100644 --- a/tests/ut/cpp/dataset/tokenizer_op_test.cc +++ b/tests/ut/cpp/dataset/tokenizer_op_test.cc @@ -18,7 +18,14 @@ #include #include "common/common.h" +#include "dataset/text/kernels/basic_tokenizer_op.h" +#include "dataset/text/kernels/case_fold_op.h" +#include "dataset/text/kernels/normalize_utf8_op.h" +#include "dataset/text/kernels/regex_replace_op.h" +#include "dataset/text/kernels/regex_tokenizer_op.h" #include "dataset/text/kernels/unicode_char_tokenizer_op.h" +#include "dataset/text/kernels/unicode_script_tokenizer_op.h" +#include "dataset/text/kernels/whitespace_tokenizer_op.h" #include "gtest/gtest.h" #include "utils/log_adapter.h" @@ -105,3 +112,229 @@ TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) { MS_LOG(INFO) << "Out tensor6: " << output->ToString(); CheckEqual(output, {0}, ""); } + +TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) { + MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp."; + std::unique_ptr op(new WhitespaceTokenizerOp()); + std::shared_ptr input = std::make_shared("Welcome to China."); + std::shared_ptr output; + Status s = op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 3); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor1: " << output->ToString(); + CheckEqual(output, {0}, "Welcome"); + CheckEqual(output, {1}, "to"); + CheckEqual(output, {2}, "China."); + + input = std::make_shared(" hello"); + s = op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor2: " << output->ToString(); + CheckEqual(output, {0}, "hello"); + + input = std::make_shared("hello"); + s = op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor3: " << output->ToString(); + CheckEqual(output, {0}, "hello"); + + input = std::make_shared("hello "); + s = op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor4: " << output->ToString(); + CheckEqual(output, {0}, "hello"); + + input = std::make_shared(" "); + s = op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 
1); + MS_LOG(INFO) << "Out tensor5: " << output->ToString(); + CheckEqual(output, {0}, ""); +} + +TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) { + MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer."; + std::unique_ptr keep_whitespace_op(new UnicodeScriptTokenizerOp(true)); + std::unique_ptr skip_whitespace_op(new UnicodeScriptTokenizerOp(false)); + + std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); + std::shared_ptr output; + Status s = keep_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 10); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor1: " << output->ToString(); + CheckEqual(output, {0}, "Welcome"); + CheckEqual(output, {1}, " "); + CheckEqual(output, {2}, "to"); + CheckEqual(output, {3}, " "); + CheckEqual(output, {4}, "China"); + CheckEqual(output, {5}, "."); + CheckEqual(output, {6}, " \n "); + CheckEqual(output, {7}, "中国"); + CheckEqual(output, {8}, "\t"); + CheckEqual(output, {9}, "北京"); + s = skip_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 6); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor2: " << output->ToString(); + CheckEqual(output, {0}, "Welcome"); + CheckEqual(output, {1}, "to"); + CheckEqual(output, {2}, "China"); + CheckEqual(output, {3}, "."); + CheckEqual(output, {4}, "中国"); + CheckEqual(output, {5}, "北京"); + + input = std::make_shared(" Welcome to 中国. "); + s = skip_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 4); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor3: " << output->ToString(); + CheckEqual(output, {0}, "Welcome"); + CheckEqual(output, {1}, "to"); + CheckEqual(output, {2}, "中国"); + CheckEqual(output, {3}, "."); + s = keep_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 8); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor4: " << output->ToString(); + CheckEqual(output, {0}, " "); + CheckEqual(output, {1}, "Welcome"); + CheckEqual(output, {2}, " "); + CheckEqual(output, {3}, "to"); + CheckEqual(output, {4}, " "); + CheckEqual(output, {5}, "中国"); + CheckEqual(output, {6}, "."); + CheckEqual(output, {7}, " "); + + input = std::make_shared("Hello"); + s = keep_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor5: " << output->ToString(); + CheckEqual(output, {0}, "Hello"); + + input = std::make_shared("H"); + s = keep_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor6: " << output->ToString(); + CheckEqual(output, {0}, "H"); + + input = std::make_shared(""); + s = keep_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor7: " << output->ToString(); + CheckEqual(output, {0}, ""); + + input = std::make_shared("Hello中国Hello世界"); + s = keep_whitespace_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 4); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor8: " << output->ToString(); + CheckEqual(output, {0}, "Hello"); + CheckEqual(output, {1}, "中国"); + CheckEqual(output, {2}, "Hello"); + CheckEqual(output, {3}, "世界"); + + input = std::make_shared(" "); + s = keep_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + 
EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor10: " << output->ToString(); + CheckEqual(output, {0}, " "); + input = std::make_shared(" "); + s = skip_whitespace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 1); + MS_LOG(INFO) << "Out tensor11: " << output->ToString(); + CheckEqual(output, {0}, ""); +} + +TEST_F(MindDataTestTokenizerOp, TestCaseFold) { + MS_LOG(INFO) << "Doing TestCaseFold."; + std::unique_ptr case_fold_op(new CaseFoldOp()); + std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); + std::shared_ptr output; + Status s = case_fold_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 0); + MS_LOG(INFO) << "Out tensor1: " << output->ToString(); + CheckEqual(output, {}, "welcome to china. \n 中国\t北京"); +} + +TEST_F(MindDataTestTokenizerOp, TestNormalize) { + MS_LOG(INFO) << "Doing TestNormalize."; + std::unique_ptr nfc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfc)); + std::unique_ptr nfkc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkc)); + std::unique_ptr nfd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfd)); + std::unique_ptr nfkd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkd)); + std::shared_ptr input = std::make_shared("ṩ"); + std::shared_ptr output; + Status s = nfc_normalize_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + MS_LOG(INFO) << "NFC str:" << output->ToString(); + + nfkc_normalize_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + MS_LOG(INFO) << "NFKC str:" << output->ToString(); + + nfd_normalize_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + MS_LOG(INFO) << "NFD str:" << output->ToString(); + + nfkd_normalize_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + MS_LOG(INFO) << "NFKD str:" << output->ToString(); +} + +TEST_F(MindDataTestTokenizerOp, TestRegexReplace) { + MS_LOG(INFO) << "Doing TestRegexReplace."; + std::unique_ptr regex_replace_op(new RegexReplaceOp("\\s+", "_", true)); + std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); + std::shared_ptr output; + Status s = regex_replace_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output->Size(), 1); + EXPECT_EQ(output->Rank(), 0); + MS_LOG(INFO) << "Out tensor1: " << output->ToString(); + CheckEqual(output, {}, "Welcome_to_China._中国_北京"); +} + +TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) { + MS_LOG(INFO) << "Doing TestRegexTokenizerOp."; + std::unique_ptr regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "")); + std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); + std::shared_ptr output; + Status s = regex_tokenizer_op->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); +} + +TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) { + MS_LOG(INFO) << "Doing TestBasicTokenizer."; + //bool lower_case, bool keep_whitespace, + // NormalizeForm normalization_form, bool preserve_unused_token + std::unique_ptr basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false)); + std::shared_ptr input = std::make_shared("Welcome to China. 
中国\t北京"); + std::shared_ptr output; + Status s = basic_tokenizer->Compute(input, &output); + EXPECT_TRUE(s.IsOk()); +} \ No newline at end of file diff --git a/tests/ut/data/dataset/testTokenizerData/basic_tokenizer.txt b/tests/ut/data/dataset/testTokenizerData/basic_tokenizer.txt new file mode 100644 index 00000000000..6e18b19f7c2 --- /dev/null +++ b/tests/ut/data/dataset/testTokenizerData/basic_tokenizer.txt @@ -0,0 +1,7 @@ +Welcome to Beijing北京欢迎您 +長風破浪會有時,直掛雲帆濟滄海 +😀嘿嘿😃哈哈😄大笑😁嘻嘻 +明朝(1368—1644年)和清朝(1644—1911年),是中国封建王朝史上最后两个朝代 +明代(1368-1644)と清代(1644-1911)は、中国の封建王朝の歴史における最後の2つの王朝でした +명나라 (1368-1644)와 청나라 (1644-1911)는 중국 봉건 왕조의 역사에서 마지막 두 왕조였다 +Tĥïŝ ĩš â fůňķŷ Šťŕĭńġ \ No newline at end of file diff --git a/tests/ut/data/dataset/testTokenizerData/bert_tokenizer.txt b/tests/ut/data/dataset/testTokenizerData/bert_tokenizer.txt new file mode 100644 index 00000000000..657b7599765 --- /dev/null +++ b/tests/ut/data/dataset/testTokenizerData/bert_tokenizer.txt @@ -0,0 +1,14 @@ +床前明月光 +疑是地上霜 +举头望明月 +低头思故乡 +I am making small mistakes during working hours +😀嘿嘿😃哈哈😄大笑😁嘻嘻 +繁體字 +unused [CLS] +unused [SEP] +unused [UNK] +unused [PAD] +unused [MASK] +12+/-28=40/-16 +Hello World! \ No newline at end of file diff --git a/tests/ut/data/dataset/testTokenizerData/normalize.txt b/tests/ut/data/dataset/testTokenizerData/normalize.txt new file mode 100644 index 00000000000..85db53b8459 --- /dev/null +++ b/tests/ut/data/dataset/testTokenizerData/normalize.txt @@ -0,0 +1,6 @@ +ṩ +ḍ̇ +q̣̇ +fi +2⁵ +ẛ̣ \ No newline at end of file diff --git a/tests/ut/data/dataset/testTokenizerData/regex_replace.txt b/tests/ut/data/dataset/testTokenizerData/regex_replace.txt new file mode 100644 index 00000000000..a1342ddb0fb --- /dev/null +++ b/tests/ut/data/dataset/testTokenizerData/regex_replace.txt @@ -0,0 +1,8 @@ +Hello World +Let's Go +1:hello +2:world +31:beijing +Welcome to China! + 我 不想 长大 +Welcome to Shenzhen! diff --git a/tests/ut/data/dataset/testTokenizerData/regex_tokenizer.txt b/tests/ut/data/dataset/testTokenizerData/regex_tokenizer.txt new file mode 100644 index 00000000000..5846355afe5 --- /dev/null +++ b/tests/ut/data/dataset/testTokenizerData/regex_tokenizer.txt @@ -0,0 +1,3 @@ +Welcome to Shenzhen! +北京欢迎您!Welcome to Beijing! +12¥+36¥=? \ No newline at end of file diff --git a/tests/ut/data/dataset/testTokenizerData/wordpiece_tokenizer.txt b/tests/ut/data/dataset/testTokenizerData/wordpiece_tokenizer.txt new file mode 100644 index 00000000000..683eebd221f --- /dev/null +++ b/tests/ut/data/dataset/testTokenizerData/wordpiece_tokenizer.txt @@ -0,0 +1,25 @@ +my +favorite +book +is +love +during +the +cholera +era +what +我 +最 +喜 +欢 +的 +书 +是 +霍 +乱 +时 +期 +的 +爱 +情 +您 \ No newline at end of file diff --git a/tests/ut/python/dataset/test_basic_tokenizer.py b/tests/ut/python/dataset/test_basic_tokenizer.py new file mode 100644 index 00000000000..45c9f94da49 --- /dev/null +++ b/tests/ut/python/dataset/test_basic_tokenizer.py @@ -0,0 +1,83 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Testing BasicTokenizer op in DE +""" +import numpy as np +import mindspore.dataset as ds +from mindspore import log as logger +import mindspore.dataset.text as nlp + +BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt" + +test_paras = [ + dict( + first=1, + last=6, + expected_tokens= + [['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'], + ['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'], + ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'], + ['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝', + '(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封', + '建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'], + ['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代', + '(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封', + '建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'], + ['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는', + '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']] + ), + dict( + first=7, + last=7, + expected_tokens=[['this', 'is', 'a', 'funky', 'string']], + lower_case=True + ), +] + + +def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False, + normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False): + dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + + basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case, + keep_whitespace=keep_whitespace, + normalization_form=normalization_form, + preserve_unused_token=preserve_unused_token) + + dataset = dataset.map(operations=basic_tokenizer) + count = 0 + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']) + logger.info("Out:", text) + logger.info("Exp:", expected_tokens[count]) + np.testing.assert_array_equal(text, expected_tokens[count]) + count = count + 1 + + +def test_basic_tokenizer(): + """ + Test BasicTokenizer + """ + for paras in test_paras: + check_basic_tokenizer(**paras) + + +if __name__ == '__main__': + test_basic_tokenizer() diff --git a/tests/ut/python/dataset/test_bert_tokenizer.py b/tests/ut/python/dataset/test_bert_tokenizer.py new file mode 100644 index 00000000000..8974d022e6e --- /dev/null +++ b/tests/ut/python/dataset/test_bert_tokenizer.py @@ -0,0 +1,183 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +""" +Testing BertTokenizer op in DE +""" +import numpy as np +import mindspore.dataset as ds +from mindspore import log as logger +import mindspore.dataset.text as nlp + +BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt" + +vocab_bert = [ + "床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低", "思", "故", "乡", + "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", + "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", + "😀", "😃", "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I", + "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]" +] +pad = '' +test_paras = [ + # test chinese text + dict( + first=1, + last=4, + expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']], + [['疑'], ['是'], ['地'], ['上'], ['霜']], + [['举'], ['头'], ['望'], ['明'], ['月']], + [['低'], ['头'], ['思'], ['故'], ['乡']]], + vocab_list=vocab_bert + ), + # test english text + dict( + first=5, + last=5, + expect_str=[[['i', pad], + ["am", pad], + ['mak', '##ing'], + ['small', pad], + ['mistake', '##s'], + ['during', pad], + ['work', '##ing'], + ['hour', '##s']]], + lower_case=True, + vocab_list=vocab_bert + ), + dict( + first=5, + last=5, + expect_str=[[['I', pad], + ["am", pad], + ['mak', '##ing'], + ['small', pad], + ['mistake', '##s'], + ['during', pad], + ['work', '##ing'], + ['hour', '##s']]], + lower_case=False, + vocab_list=vocab_bert + ), + # test emoji tokens + dict( + first=6, + last=7, + expect_str=[ + [['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']], + [['繁'], ['體'], ['字']]], + normalization_form=nlp.utils.NormalizeForm.NFKC, + vocab_list=vocab_bert + ), + # test preserved tokens + dict( + first=8, + last=12, + expect_str=[ + [['[UNK]'], ['[CLS]']], + [['[UNK]'], ['[SEP]']], + [['[UNK]'], ['[UNK]']], + [['[UNK]'], ['[PAD]']], + [['[UNK]'], ['[MASK]']], + ], + lower_case=False, + vocab_list=vocab_bert, + preserve_unused_token=True, + ), + # test special symbol + dict( + first=13, + last=13, + expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]], + preserve_unused_token=True, + vocab_list=vocab_bert + ), + # test non-default parms + dict( + first=8, + last=8, + expect_str=[ + [['[UNK]'], [' '], ['[CLS]']], + ], + lower_case=False, + vocab_list=vocab_bert, + preserve_unused_token=True, + keep_whitespace=True + ), + dict( + first=8, + last=8, + expect_str=[ + [['unused'], [' '], ['[CLS]']], + ], + lower_case=False, + vocab_list=vocab_bert, + preserve_unused_token=True, + keep_whitespace=True, + unknown_token='' + ), + dict( + first=8, + last=8, + expect_str=[ + [['unused'], [' '], ['['], ['CLS'], [']']], + ], + lower_case=False, + vocab_list=vocab_bert, + preserve_unused_token=False, + keep_whitespace=True, + unknown_token='' + ), +] + + +def check_bert_tokenizer(first, last, expect_str, + vocab_list, + suffix_indicator='##', + max_bytes_per_token=100, unknown_token='[UNK]', + lower_case=False, keep_whitespace=False, + normalization_form=nlp.utils.NormalizeForm.NONE, + preserve_unused_token=False): + dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + vocab = nlp.Vocab.from_list(vocab_list) + tokenizer_op = nlp.BertTokenizer( + vocab=vocab, suffix_indicator=suffix_indicator, + max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token, + lower_case=lower_case, 
keep_whitespace=keep_whitespace, + normalization_form=normalization_form, + preserve_unused_token=preserve_unused_token) + dataset = dataset.map(operations=tokenizer_op) + count = 0 + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']) + logger.info("Out:", text) + logger.info("Exp:", expect_str[count]) + np.testing.assert_array_equal(text, expect_str[count]) + count = count + 1 + + +def test_bert_tokenizer(): + """ + Test WordpieceTokenizer + """ + for paras in test_paras: + check_bert_tokenizer(**paras) + + +if __name__ == '__main__': + test_bert_tokenizer() diff --git a/tests/ut/python/dataset/test_tokenizer.py b/tests/ut/python/dataset/test_tokenizer.py index 3aeb035312a..2ec988d8dcd 100644 --- a/tests/ut/python/dataset/test_tokenizer.py +++ b/tests/ut/python/dataset/test_tokenizer.py @@ -15,11 +15,15 @@ """ Testing UnicodeCharTokenizer op in DE """ +import numpy as np import mindspore.dataset as ds from mindspore import log as logger import mindspore.dataset.text as nlp DATA_FILE = "../data/dataset/testTokenizerData/1.txt" +NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt" +REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt" +REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt" def split_by_unicode_char(input_strs): @@ -48,5 +52,182 @@ def test_unicode_char_tokenizer(): assert split_by_unicode_char(input_strs) == tokens +def test_whitespace_tokenizer(): + """ + Test WhitespaceTokenizer + """ + whitespace_strs = [["Welcome", "to", "Beijing!"], + ["北京欢迎您!"], + ["我喜欢English!"], + [""]] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = nlp.WhitespaceTokenizer() + dataset = dataset.map(operations=tokenizer) + tokens = [] + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']).tolist() + tokens.append(text) + logger.info("The out tokens is : {}".format(tokens)) + assert whitespace_strs == tokens + + +def test_unicode_script_tokenizer(): + """ + Test UnicodeScriptTokenizer when para keep_whitespace=False + """ + unicode_script_strs = [["Welcome", "to", "Beijing", "!"], + ["北京欢迎您", "!"], + ["我喜欢", "English", "!"], + [""]] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False) + dataset = dataset.map(operations=tokenizer) + + tokens = [] + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']).tolist() + tokens.append(text) + logger.info("The out tokens is : {}".format(tokens)) + assert unicode_script_strs == tokens + + +def test_unicode_script_tokenizer2(): + """ + Test UnicodeScriptTokenizer when para keep_whitespace=True + """ + unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"], + ["北京欢迎您", "!"], + ["我喜欢", "English", "!"], + [" "]] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True) + dataset = dataset.map(operations=tokenizer) + tokens = [] + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']).tolist() + tokens.append(text) + logger.info("The out tokens is :", tokens) + assert unicode_script_strs2 == tokens + + +def test_case_fold(): + """ + Test CaseFold + """ + expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + op = nlp.CaseFold() + dataset = dataset.map(operations=op) + + lower_strs = [] + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']).tolist() + lower_strs.append(text) + assert 
lower_strs == expect_strs + + +def test_normalize_utf8(): + """ + Test NormalizeUTF8 + """ + + def normalize(normalize_form): + dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False) + normalize = nlp.NormalizeUTF8(normalize_form=normalize_form) + dataset = dataset.map(operations=normalize) + out_bytes = [] + out_texts = [] + for i in dataset.create_dict_iterator(): + out_bytes.append(i['text']) + out_texts.append(nlp.to_str(i['text']).tolist()) + logger.info("The out bytes is : ", out_bytes) + logger.info("The out texts is: ", out_texts) + return out_bytes + + expect_normlize_data = [ + # NFC + [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', + b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'], + # NFKC + [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', + b'fi', b'25', b'\xe1\xb9\xa9'], + # NFD + [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', + b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'], + # NFKD + [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', + b'fi', b'25', b's\xcc\xa3\xcc\x87'] + ] + assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normlize_data[0] + assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normlize_data[1] + assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normlize_data[2] + assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normlize_data[3] + + +def test_regex_replace(): + """ + Test RegexReplace + """ + + def regex_replace(first, last, expect_str, pattern, replace): + dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + replace_op = nlp.RegexReplace(pattern, replace) + dataset = dataset.map(operations=replace_op) + out_text = [] + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']).tolist() + out_text.append(text) + logger.info("Out:", out_text) + logger.info("Exp:", expect_str) + assert expect_str == out_text + + regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_') + regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "") + regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "") + regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "") + + +def test_regex_tokenizer(): + """ + Test RegexTokenizer + """ + + def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern): + dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern) + dataset = dataset.map(operations=tokenizer_op) + out_text = [] + count = 0 + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']).tolist() + np.testing.assert_array_equal(text, expect_str[count]) + count += 1 + out_text.append(text) + logger.info("Out:", out_text) + logger.info("Exp:", expect_str) + + regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "") + regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+") + regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}") + regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+") + regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "") + regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "") + + if __name__ == '__main__': 
test_unicode_char_tokenizer() + test_whitespace_tokenizer() + test_unicode_script_tokenizer() + test_unicode_script_tokenizer2() + test_case_fold() + test_normalize_utf8() + test_regex_replace() + test_regex_tokenizer() diff --git a/tests/ut/python/dataset/test_wordpiece_tokenizer.py b/tests/ut/python/dataset/test_wordpiece_tokenizer.py new file mode 100644 index 00000000000..79348847408 --- /dev/null +++ b/tests/ut/python/dataset/test_wordpiece_tokenizer.py @@ -0,0 +1,113 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Testing WordpieceTokenizer op in DE +""" +import numpy as np +import mindspore.dataset as ds +from mindspore import log as logger +import mindspore.dataset.text as nlp + +WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt" + +vocab_english = [ + "book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the" +] + +vocab_chinese = [ + "我", '最', '喜', '欢', '的', '书', '是', '霍', '乱', '时', '期', '爱', '情' +] + +vocab_mix = vocab_chinese + vocab_english + +test_paras = [ + dict( + first=1, + last=10, + expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], + ['era'], ['[UNK]']], + vocab_list=vocab_english + ), + dict( + first=1, + last=10, + expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], + ['era'], ['what']], + vocab_list=vocab_english, + unknown_token="" + ), + dict( + first=1, + last=10, + expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']], + vocab_list=vocab_english, + max_bytes_per_token=4 + ), + dict( + first=11, + last=25, + expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], + ['[UNK]']], + vocab_list=vocab_chinese, + ), + dict( + first=25, + last=25, + expect_str=[['您']], + vocab_list=vocab_chinese, + unknown_token="" + ), + dict( + first=1, + last=25, + expect_str=[ + ['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'], + ['[UNK]'], + ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], + ['[UNK]']], + vocab_list=vocab_mix, + ), +] + + +def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100): + dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + vocab = nlp.Vocab.from_list(vocab_list) + tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token, + max_bytes_per_token=max_bytes_per_token) + dataset = dataset.map(operations=tokenizer_op) + count = 0 + for i in dataset.create_dict_iterator(): + text = nlp.to_str(i['text']) + logger.info("Out:", text) + 
logger.info("Exp:", expect_str[count]) + np.testing.assert_array_equal(text, expect_str[count]) + count = count + 1 + + +def test_wordpiece_tokenizer(): + """ + Test WordpieceTokenizer + """ + for paras in test_paras: + check_wordpiece_tokenizer(**paras) + + +if __name__ == '__main__': + test_wordpiece_tokenizer() diff --git a/third_party/icu4c/filter.json b/third_party/icu4c/filter.json new file mode 100644 index 00000000000..b3decad8fb4 --- /dev/null +++ b/third_party/icu4c/filter.json @@ -0,0 +1,6 @@ +{ + "strategy": "additive", + "featureFilters": { + "normalization": "include" + } +} \ No newline at end of file
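As a quick end-to-end illustration of the ops these tests exercise, the sketch below chains CaseFold, NormalizeUTF8, RegexReplace, and UnicodeScriptTokenizer into a single mindspore.dataset pipeline. It reuses only the APIs and the 1.txt test file already appearing in this change set; the NFKC form and the whitespace-collapsing pattern are illustrative choices, not values taken from the patch.

import mindspore.dataset as ds
import mindspore.dataset.text as nlp

DATA_FILE = "../data/dataset/testTokenizerData/1.txt"

dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
# Lowercase via ICU case folding, then apply Unicode NFKC normalization.
dataset = dataset.map(operations=nlp.CaseFold())
dataset = dataset.map(operations=nlp.NormalizeUTF8(normalize_form=nlp.utils.NormalizeForm.NFKC))
# Collapse runs of whitespace before splitting on Unicode script boundaries.
dataset = dataset.map(operations=nlp.RegexReplace("\\s+", " "))
dataset = dataset.map(operations=nlp.UnicodeScriptTokenizer(keep_whitespace=False))
for row in dataset.create_dict_iterator():
    print(nlp.to_str(row['text']).tolist())  # tokens produced for each input line

The accompanying third_party/icu4c/filter.json trims the bundled ICU data down to the normalization feature that CaseFold and NormalizeUTF8 depend on; in ICU's data build tooling such a filter is normally supplied through the ICU_DATA_FILTER_FILE environment variable, though the exact build wiring is outside this patch.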