Gets an array of sentences from a string.

image_pdfimage_print
   
 

#region Copyright (c) 2004, Ryan Whitaker
/*********************************************************************************
'
' Copyright (c) 2004 Ryan Whitaker
'
' This software is provided 'as-is', without any express or implied warranty. In no 
' event will the authors be held liable for any damages arising from the use of this 
' software.
' 
' Permission is granted to anyone to use this software for any purpose, including 
' commercial applications, and to alter it and redistribute it freely, subject to the 
' following restrictions:
'
' 1. The origin of this software must not be misrepresented; you must not claim that 
' you wrote the original software. If you use this software in a product, an 
' acknowledgment (see the following) in the product documentation is required.
'
' This product uses software written by the developers of NClassifier
' (http://nclassifier.sourceforge.net).  NClassifier is a .NET port of the Nick
' Lothian's Java text classification engine, Classifier4J 
' (http://classifier4j.sourceforge.net).
'
' 2. Altered source versions must be plainly marked as such, and must not be 
' misrepresented as being the original software.
'
' 3. This notice may not be removed or altered from any source distribution.
'
'********************************************************************************/
#endregion

using System;
using System.Collections;
using System.Text.RegularExpressions;

namespace NClassifier
{
  /// <summary>
  /// General-purpose text helpers used by NClassifier.
  /// </summary>
  public class Utilities
  {
    /// <summary>
    /// Matches a sentence boundary: one or more terminators (".", "!" or "?")
    /// followed by either a run of whitespace or the very end of the input.
    /// Equivalent to the original Java regex <c>(\.|!|\?)+(\s|\z)</c>; the
    /// terminators must be literal characters, so they are placed in a
    /// character class instead of relying on backslash escapes.
    /// </summary>
    private static readonly Regex SentenceBoundary = new Regex(@"[.!?]+(?:\s+|\z)");

    /// <summary>
    /// Gets an array of sentences.
    /// </summary>
    /// <param name="input">A string that contains sentences. May be null.</param>
    /// <returns>
    /// An array of strings, each element containing one sentence with its
    /// terminating punctuation and the whitespace that follows it removed.
    /// Returns an empty array when <paramref name="input"/> is null or
    /// contains no text.
    /// </returns>
    public static string[] GetSentences(string input)
    {
      if (input == null)
      {
        return new string[0];
      }

      // A terminator that is not followed by whitespace or end-of-input
      // (for example the dot in "3.14") is not treated as a boundary.
      string[] pieces = SentenceBoundary.Split(input);

      // Regex.Split yields an empty string when a boundary sits at the very
      // start or end of the input (e.g. after a final "."); drop those so the
      // caller only sees real sentences.
      ArrayList sentences = new ArrayList(pieces.Length);
      foreach (string piece in pieces)
      {
        if (piece.Length > 0)
        {
          sentences.Add(piece);
        }
      }

      return (string[])sentences.ToArray(typeof(string));
    }
  }
}

   
     


This entry was posted in Data Types. Bookmark the permalink.